From 1214a5fe59af43ba23346d19a39de12f62ef0917 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Tue, 18 Jun 2024 15:39:05 +0200
Subject: [PATCH 01/98] reorder init steps irm for readability

---
 doubleml/irm/irm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py
index 82d22d1dd..eface6917 100644
--- a/doubleml/irm/irm.py
+++ b/doubleml/irm/irm.py
@@ -143,7 +143,6 @@ def __init__(self,
         ml_g_is_classifier = self._check_learner(ml_g, 'ml_g', regressor=True, classifier=True)
         _ = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True)
         self._learner = {'ml_g': ml_g, 'ml_m': ml_m}
-        self._normalize_ipw = normalize_ipw
         if ml_g_is_classifier:
             if obj_dml_data.binary_outcome:
                 self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
@@ -154,6 +153,7 @@ def __init__(self,
             self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'}
         self._initialize_ml_nuisance_params()
 
+        self._normalize_ipw = normalize_ipw
         if not isinstance(self.normalize_ipw, bool):
             raise TypeError('Normalization indicator has to be boolean. ' +
                             f'Object of type {str(type(self.normalize_ipw))} passed.')

From 86cd871d6e0d6bfbcde6e4578ab0b4bc1144492a Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Tue, 18 Jun 2024 16:17:46 +0200
Subject: [PATCH 02/98] first version of discrete treatment dataset

---
 doubleml/datasets.py | 60 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/doubleml/datasets.py b/doubleml/datasets.py
index b510f4b6d..cfbebdd99 100644
--- a/doubleml/datasets.py
+++ b/doubleml/datasets.py
@@ -1433,3 +1433,63 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleM
         return DoubleMLData(data, 'y', 'd', x_cols, 'z', None, 's')
     else:
         raise ValueError('Invalid return_type.')
+
+
+def make_irm_data_discrete_treatements(n_obs=200, p=10, support_size=5, n_levels=3, random_state=42):
+    """
+    Generates data from an interactive regression (IRM) model with multiple treatment levels.
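+
+    Parameters
+    ----------
+    n_obs : int
+        The number of observations to simulate. Default is ``200``.
+    p : int
+        The number of covariates. Default is ``10``.
+    support_size : int
+        The number of covariates with nonzero coefficients in the outcome and treatment equations. Default is ``5``.
+    n_levels : int
+        The number of treatment levels. Default is ``3``.
+    random_state : int
+        Seed passed to ``np.random.seed``. Default is ``42``.
+
+    Returns
+    -------
+    result_dict : dict
+        Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``.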
+ """ + + np.random.seed(random_state) + + # define continous treatment effect + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) + + # Outcome support and coefficients + support_y = np.random.choice(np.arange(p), size=support_size, replace=False) + coefs_y = np.random.uniform(0, 1, size=support_size) + # treatment support and coefficients + support_d = support_y + range_coefs_d = [0.2, 0.3] + coefs_d = np.random.uniform(range_coefs_d[0], range_coefs_d[1], size=support_size) + + # noise + epsilon = np.random.uniform(-1, 1, size=n_obs) + + # Generate controls, covariates, treatments and outcomes + x = np.random.uniform(0, 1, size=(n_obs, p)) + # Heterogeneous treatment effects + te = treatment_effect(x) + + # set d to be a discrete number of levels + range_cont_d = support_size * range_coefs_d + # devide the range into n_levels + levels = np.linspace(range_cont_d[0], range_cont_d[1], n_levels - 1) + + # define a discrete treatment version (with a baseline probability) + eta = np.random.uniform(0, 1, size=n_obs) + potential_level = sum([1.0 * (np.dot(x[:, support_d], coefs_d) >= level) for level in levels]) + 1 + d = 1.0 * (eta >= 1/n_levels) * potential_level + + # only treated for d > 0 compared to the baseline + y = te * (d > 0) + np.dot(x[:, support_y], coefs_y) + epsilon + + oracle_values = { + 'levels': levels, + 'support_y': support_y, + 'coefs_y': coefs_y, + 'support_d': support_d, + 'coefs_d': coefs_d, + 'te': te, + 'treatment_effect': treatment_effect + } + + resul_dict = { + 'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values + } + + return resul_dict From 8728413fe54211947523cb0850defc1ecdfd0128 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:18:03 +0200 Subject: [PATCH 03/98] first apo model --- doubleml/__init__.py | 2 + doubleml/irm/apo.py | 157 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 doubleml/irm/apo.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 700991c79..69e064a00 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -5,6 +5,7 @@ from .plm.plr import DoubleMLPLR from .plm.pliv import DoubleMLPLIV from .irm.irm import DoubleMLIRM +from .irm.apo import DoubleMLAPO from .irm.iivm import DoubleMLIIVM from .double_ml_data import DoubleMLData, DoubleMLClusterData from .did.did import DoubleMLDID @@ -23,6 +24,7 @@ 'DoubleMLPLR', 'DoubleMLPLIV', 'DoubleMLIRM', + 'DoubleMLAPO', 'DoubleMLIIVM', 'DoubleMLData', 'DoubleMLClusterData', diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py new file mode 100644 index 000000000..ce1dc3ff3 --- /dev/null +++ b/doubleml/irm/apo.py @@ -0,0 +1,157 @@ +import numpy as np + +from ..double_ml import DoubleML + +from ..double_ml_score_mixins import LinearScoreMixin +from ..double_ml_data import DoubleMLData + +from ..utils._checks import _check_score, _check_trimming, _check_weights + + +class DoubleMLAPO(LinearScoreMixin, DoubleML): + """Double machine learning average potential outcomes for interactive regression models + + Parameters + """ + def __init__(self, + obj_dml_data, + ml_g, + ml_m, + treatment_level, + n_folds=5, + n_rep=1, + score='APO', + weights=None, + normalize_ipw=False, + trimming_rule='truncate', + trimming_threshold=1e-2, + draw_sample_splitting=True): + super().__init__(obj_dml_data, + n_folds, + n_rep, + score, + draw_sample_splitting) + + # set up treatment level and check data + self._treatment_level = treatment_level + 
+        self._treated = self._dml_data.d == self._treatment_level
+
+        self._check_data(self._dml_data)
+        valid_scores = ['APO']
+        _check_score(self.score, valid_scores, allow_callable=False)
+
+        # set stratification for resampling
+        self._strata = self._dml_data.d
+        if draw_sample_splitting:
+            self.draw_sample_splitting()
+
+        ml_g_is_classifier = self._check_learner(ml_g, 'ml_g', regressor=True, classifier=True)
+        _ = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True)
+        self._learner = {'ml_g': ml_g, 'ml_m': ml_m}
+        self._normalize_ipw = normalize_ipw
+        if ml_g_is_classifier:
+            if obj_dml_data.binary_outcome:
+                self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
+            else:
+                raise ValueError(f'The ml_g learner {str(ml_g)} was identified as classifier '
+                                 'but the outcome variable is not binary with values 0 and 1.')
+        else:
+            self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'}
+        self._initialize_ml_nuisance_params()
+
+        self._normalize_ipw = normalize_ipw
+        if not isinstance(self.normalize_ipw, bool):
+            raise TypeError('Normalization indicator has to be boolean. ' +
+                            f'Object of type {str(type(self.normalize_ipw))} passed.')
+        self._trimming_rule = trimming_rule
+        self._trimming_threshold = trimming_threshold
+        _check_trimming(self._trimming_rule, self._trimming_threshold)
+
+        self._sensitivity_implemented = True
+        self._external_predictions_implemented = True
+
+        # ATE weights are the standard case
+        _check_weights(weights, score="ATE", n_obs=obj_dml_data.n_obs, n_rep=self.n_rep)
+        self._initialize_weights(weights)
+
+        return self
+
+    @property
+    def treatment_level(self):
+        """
+        Chosen treatment level for average potential outcomes.
+        """
+        return self._treatment_level
+
+    @property
+    def treated(self):
+        """
+        Indicator for treated observations (with the corresponding treatment level).
+        """
+        return self._treated
+
+    @property
+    def normalize_ipw(self):
+        """
+        Indicates whether the inverse probability weights are normalized.
+        """
+        return self._normalize_ipw
+
+    @property
+    def trimming_rule(self):
+        """
+        Specifies the used trimming rule.
+        """
+        return self._trimming_rule
+
+    @property
+    def trimming_threshold(self):
+        """
+        Specifies the used trimming threshold.
+        """
+        return self._trimming_threshold
+
+    @property
+    def weights(self):
+        """
+        Specifies the weights for a weighted average potential outcome.
+        """
+        return self._weights
+
+    def _initialize_ml_nuisance_params(self):
+        valid_learner = ['ml_g', 'ml_m']
+        self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols}
+                        for learner in valid_learner}
+
+    def _nuisance_est(self):
+        # Estimate nuisance parameters
+        # This is a placeholder for the estimation logic
+        print("Estimating nuisance parameters...")
+
+    def _nuisance_tuning(self):
+        # Tune nuisance parameters
+        # This is a placeholder for tuning logic
+        print("Tuning nuisance parameters...")
+
+    def _sensitivity_element_est(self):
+        # Estimate sensitivity elements
+        # This is a placeholder for sensitivity estimation logic
+        print("Estimating sensitivity elements...")
+
+    def _check_data(self, obj_dml_data):
+        if not isinstance(obj_dml_data, DoubleMLData):
+            raise TypeError('The data must be of DoubleMLData or DoubleMLClusterData type. '
+                            f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.')
+        if obj_dml_data.z_cols is not None:
+            raise ValueError('Incompatible data. ' +
+                             ' and '.join(obj_dml_data.z_cols) +
+                             ' have been set as instrumental variable(s).')
+
+        # check if treatment level is valid
+        if np.sum(self.treated) < 5:
+            raise ValueError(
+                'The number of treated observations is less than 5. ' +
+                f'Number of treated observations: {np.sum(self.treated)} for treatment level {self.treatment_level}.'
+            )
+
+        return

From 545de586aeb0349386754bbf00ffd1cee6b6e97f Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Tue, 18 Jun 2024 16:18:09 +0200
Subject: [PATCH 04/98] Create test_apo_exceptions.py

---
 doubleml/irm/tests/test_apo_exceptions.py | 122 ++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 doubleml/irm/tests/test_apo_exceptions.py

diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py
new file mode 100644
index 000000000..cf5227957
--- /dev/null
+++ b/doubleml/irm/tests/test_apo_exceptions.py
@@ -0,0 +1,122 @@
+import pytest
+import pandas as pd
+import numpy as np
+
+from doubleml import DoubleMLAPO, DoubleMLData
+from doubleml.datasets import make_irm_data_discrete_treatements, make_iivm_data
+
+from sklearn.linear_model import Lasso, LogisticRegression
+
+n = 100
+data_apo = make_irm_data_discrete_treatements(n_obs=n)
+df_apo = pd.DataFrame(np.column_stack((data_apo['y'], data_apo['d'], data_apo['x'])),
+                      columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo['x'].shape[1])])
+
+dml_data = DoubleMLData(df_apo, 'y', 'd')
+
+ml_g = Lasso()
+ml_m = LogisticRegression()
+
+
+@pytest.mark.ci
+def test_apo_exception_data():
+    msg = 'The data must be of DoubleMLData or DoubleMLClusterData type.'
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLAPO(pd.DataFrame(), ml_g, ml_m, treatment_level=0)
+
+    dml_data_z = make_iivm_data()
+    msg = r'Incompatible data. z have been set as instrumental variable\(s\).'
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data_z, ml_g, ml_m, treatment_level=0)
+
+    msg = 'The number of treated observations is less than 5. Number of treated observations: 0 for treatment level 1.1.'
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=1.1)
+
+
+@pytest.mark.ci
+def test_apo_exception_scores():
+    msg = 'Invalid score MAR. Valid score APO.'
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, score='MAR')
+
+
+@pytest.mark.ci
+def test_apo_exception_trimming_rule():
+    msg = 'Invalid trimming_rule discard. Valid trimming_rule truncate.'
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, trimming_rule='discard')
+
+    # check the trimming_threshold exceptions
+    msg = "trimming_threshold has to be a float. Object of type <class 'str'> passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        trimming_rule='truncate', trimming_threshold="0.1")
+
+    msg = 'Invalid trimming_threshold 0.6. trimming_threshold has to be between 0 and 0.5.'
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        trimming_rule='truncate', trimming_threshold=0.6)
+
+
+@pytest.mark.ci
+def test_apo_exception_ipw_normalization():
+    msg = "Normalization indicator has to be boolean. Object of type <class 'int'> passed."
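+    # passing an int (1) instead of a bool should raise a TypeError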
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, normalize_ipw=1)
+
+
+@pytest.mark.ci
+def test_apo_exception_weights():
+    msg = "weights must be a numpy array or dictionary. weights of type <class 'int'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, weights=1)
+    msg = r"weights must have keys \['weights', 'weights_bar'\]. keys dict_keys\(\['d'\]\) were passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, weights={'d': [1, 2, 3]})
+
+    # shape checks
+    msg = rf"weights must have shape \({n},\). weights of shape \(1,\) was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, weights=np.ones(1))
+    msg = rf"weights must have shape \({n},\). weights of shape \({n}, 2\) was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, weights=np.ones((n, 2)))
+
+    msg = rf"weights must have shape \({n},\). weights of shape \(1,\) was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        weights={'weights': np.ones(1), 'weights_bar': np.ones(1)})
+    msg = rf"weights must have shape \({n},\). weights of shape \({n}, 2\) was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        weights={'weights': np.ones((n, 2)), 'weights_bar': np.ones((n, 2))})
+    msg = rf"weights_bar must have shape \({n}, 1\). weights_bar of shape \({n}, 2\) was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        weights={'weights': np.ones(n), 'weights_bar': np.ones((n, 2))})
+
+    # value checks
+    msg = "All weights values must be greater or equal 0."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        weights=-1*np.ones(n,))
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        weights={'weights': -1*np.ones(n,), 'weights_bar': np.ones((n, 1))})
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0,
+                        weights={'weights': np.ones(n,), 'weights_bar': -1*np.ones((n, 1))})
+
+    msg = "At least one weight must be non-zero."
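+    # all-zero weights must be rejected, whether passed as an array or inside a dict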
+ with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, + weights=np.zeros((dml_data.d.shape[0], ))) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, + weights={'weights': np.zeros((dml_data.d.shape[0], )), + 'weights_bar': np.ones((dml_data.d.shape[0], 1))}) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, + weights={'weights': np.ones((dml_data.d.shape[0], )), + 'weights_bar': np.zeros((dml_data.d.shape[0], 1))}) From 4eaaa526f9aaf7f65aea2e925c1aac06959f6220 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 17:03:38 +0200 Subject: [PATCH 05/98] update irm sensitivity atte --- doubleml/irm/irm.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index eface6917..0fa6ec749 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -376,11 +376,7 @@ def _sensitivity_element_est(self, preds): m_hat = preds['predictions']['ml_m'] g_hat0 = preds['predictions']['ml_g0'] - if self.score == 'ATE': - g_hat1 = preds['predictions']['ml_g1'] - else: - assert self.score == 'ATTE' - g_hat1 = y + g_hat1 = preds['predictions']['ml_g1'] # use weights make this extendable weights, weights_bar = self._get_weights(m_hat=m_hat) From 825aa0c387a1b50fbd1e1d137485c8f6bb866120 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 17:11:42 +0200 Subject: [PATCH 06/98] Update _utils_irm_manual.py --- doubleml/irm/tests/_utils_irm_manual.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doubleml/irm/tests/_utils_irm_manual.py b/doubleml/irm/tests/_utils_irm_manual.py index c249c9cf9..5fbdd174c 100644 --- a/doubleml/irm/tests/_utils_irm_manual.py +++ b/doubleml/irm/tests/_utils_irm_manual.py @@ -248,11 +248,7 @@ def fit_sensitivity_elements_irm(y, d, all_coef, predictions, score, n_rep): m_hat = predictions['ml_m'][:, i_rep, 0] g_hat0 = predictions['ml_g0'][:, i_rep, 0] - if score == 'ATE': - g_hat1 = predictions['ml_g1'][:, i_rep, 0] - else: - assert score == 'ATTE' - g_hat1 = y + g_hat1 = predictions['ml_g1'][:, i_rep, 0] if score == 'ATE': weights = np.ones_like(d) From 7c3f1c1553bca0b629afed637c7afe5915817a5f Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 17:14:04 +0200 Subject: [PATCH 07/98] add estimation and sensitivity to apo model --- doubleml/irm/apo.py | 175 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 161 insertions(+), 14 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index ce1dc3ff3..cfd8d60f1 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -1,11 +1,17 @@ import numpy as np +from sklearn.utils import check_X_y +from sklearn.utils.multiclass import type_of_target + from ..double_ml import DoubleML from ..double_ml_score_mixins import LinearScoreMixin from ..double_ml_data import DoubleMLData -from ..utils._checks import _check_score, _check_trimming, _check_weights +from ..utils._estimation import _dml_cv_predict, _get_cond_smpls, _cond_targets, _trimm, \ + _normalize_ipw +from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_finite_predictions, \ + _check_is_propensity class DoubleMLAPO(LinearScoreMixin, DoubleML): @@ -48,7 +54,6 @@ def __init__(self, ml_g_is_classifier = 
self._check_learner(ml_g, 'ml_g', regressor=True, classifier=True) _ = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) self._learner = {'ml_g': ml_g, 'ml_m': ml_m} - self._normalize_ipw = normalize_ipw if ml_g_is_classifier: if obj_dml_data.binary_outcome: self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'} @@ -74,8 +79,6 @@ def __init__(self, _check_weights(weights, score="ATE", n_obs=obj_dml_data.n_obs, n_rep=self.n_rep) self._initialize_weights(weights) - return self - @property def treatment_level(self): """ @@ -119,25 +122,169 @@ def weights(self): return self._weights def _initialize_ml_nuisance_params(self): - valid_learner = ['ml_g', 'ml_m'] + valid_learner = ['ml_g0', 'ml_g1', 'ml_m'] self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} - def _nuisance_est(self): - # Estimate nuisance parameters - # This is a placeholder for the estimation logic - print("Estimating nuisance parameters...") + def _initialize_weights(self, weights): + if weights is None: + weights = np.ones(self._dml_data.n_obs) + if isinstance(weights, np.ndarray): + self._weights = {'weights': weights} + else: + assert isinstance(weights, dict) + self._weights = weights + + def _get_weights(self, m_hat=None): + # standard case for APO/ATE + weights = self._weights['weights'] + if 'weights_bar' not in self._weights.keys(): + weights_bar = self._weights['weights'] + else: + weights_bar = self._weights['weights_bar'][:, self._i_rep] + + return weights, weights_bar + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, + force_all_finite=False) + # use the treated indicator to get the correct sample splits + x, d = check_X_y(x, self.treated, + force_all_finite=False) + + # get train indices for d == treatment_level + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) + g0_external = external_predictions['ml_g0'] is not None + g1_external = external_predictions['ml_g1'] is not None + m_external = external_predictions['ml_m'] is not None + + # nuisance g (g0 only relevant for sensitivity analysis) + if g0_external: + # use external predictions + g_hat0 = {'preds': external_predictions['ml_g0'], + 'targets': None, + 'models': None} + else: + g_hat0 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d0, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_g0'), method=self._predict_method['ml_g'], + return_models=return_models) + _check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls) + g_hat0['targets'] = _cond_targets(g_hat0['targets'], cond_sample=(d == 0)) + + if self._dml_data.binary_outcome: + binary_preds = (type_of_target(g_hat0['preds']) == 'binary') + zero_one_preds = np.all((np.power(g_hat0['preds'], 2) - g_hat0['preds']) == 0) + if binary_preds & zero_one_preds: + raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, ' + f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also ' + 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' + 'probabilities and not labels are predicted.') + + if g1_external: + # use external predictions + g_hat1 = {'preds': external_predictions['ml_g1'], + 'targets': None, + 'models': None} + else: + g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_g1'), method=self._predict_method['ml_g'], + return_models=return_models) + _check_finite_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', smpls) + # adjust target values to consider only compatible subsamples + g_hat1['targets'] = _cond_targets(g_hat1['targets'], cond_sample=(d == 1)) + + if self._dml_data.binary_outcome: + binary_preds = (type_of_target(g_hat1['preds']) == 'binary') + zero_one_preds = np.all((np.power(g_hat1['preds'], 2) - g_hat1['preds']) == 0) + if binary_preds & zero_one_preds: + raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, ' + f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also ' + 'observed to be binary with values 0 and 1. Make sure that for classifiers ' + 'probabilities and not labels are predicted.') + + # nuisance m + if m_external: + # use external predictions + m_hat = {'preds': external_predictions['ml_m'], + 'targets': None, + 'models': None} + else: + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], + return_models=return_models) + _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) + _check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12) + + # also trimm external predictions + m_hat['preds'] = _trimm(m_hat['preds'], self.trimming_rule, self.trimming_threshold) + + psi_a, psi_b = self._score_elements(y, d, g_hat0['preds'], g_hat1['preds'], + m_hat['preds'], smpls) + psi_elements = {'psi_a': psi_a, + 'psi_b': psi_b} + + preds = {'predictions': {'ml_g0': g_hat0['preds'], + 'ml_g1': g_hat1['preds'], + 'ml_m': m_hat['preds']}, + 'targets': {'ml_g0': g_hat0['targets'], + 'ml_g1': g_hat1['targets'], + 'ml_m': m_hat['targets']}, + 'models': {'ml_g0': g_hat0['models'], + 'ml_g1': g_hat1['models'], + 'ml_m': m_hat['models']} + } + return psi_elements, preds + + def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, smpls): + m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64') + if self.normalize_ipw: + m_hat_adj = _normalize_ipw(m_hat, d) + else: + m_hat_adj = m_hat + + u_hat = y - g_hat1 + weights, weights_bar = self._get_weights(m_hat=m_hat_adj) + psi_b = weights * g_hat1 + weights_bar * np.divide(np.multiply(d, u_hat), m_hat_adj) + psi_a = np.full_like(m_hat_adj, -1.0) + + return psi_a, psi_b + + def _sensitivity_element_est(self, preds): + # set elments for readability + y = self._dml_data.y + d = self.treated + + m_hat = preds['predictions']['ml_m'] + g_hat0 = preds['predictions']['ml_g0'] + g_hat1 = preds['predictions']['ml_g1'] + + weights, weights_bar = self._get_weights(m_hat=m_hat) + + sigma2_score_element = np.square(y - np.multiply(d, g_hat1) - np.multiply(1.0-d, g_hat0)) + sigma2 = np.mean(sigma2_score_element) + psi_sigma2 = sigma2_score_element - sigma2 + + # calc m(W,alpha) and Riesz representer + m_alpha = np.multiply(weights, np.multiply(weights_bar, np.divide(1.0, m_hat))) + rr = np.multiply(weights_bar, np.divide(d, m_hat)) + + nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) + nu2 = np.mean(nu2_score_element) + psi_nu2 = nu2_score_element - nu2 
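+        # note: sigma2 averages the squared outcome residuals, while nu2 averages
+        # 2 * m(W, alpha) - alpha(W)^2 for the Riesz representer rr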
+ + element_dict = {'sigma2': sigma2, + 'nu2': nu2, + 'psi_sigma2': psi_sigma2, + 'psi_nu2': psi_nu2, + 'riesz_rep': rr, + } + return element_dict def _nuisance_tuning(self): # Tune nuisance parameters # This is a placeholder for tuning logic print("Tuning nuisance parameters...") - def _sensitivity_element_est(self): - # Estimate sensitivity elements - # This is a placeholder for sensitivity estimation logic - print("Estimating sensitivity elements...") - def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): raise TypeError('The data must be of DoubleMLData type. ' From 44d3100eefc9b6ca63ff9f5e045cc5cf55fb3932 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 18:19:25 +0200 Subject: [PATCH 08/98] rename estimation --- doubleml/irm/apo.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index cfd8d60f1..4c6298b9a 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -149,11 +149,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) # use the treated indicator to get the correct sample splits - x, d = check_X_y(x, self.treated, - force_all_finite=False) + x, treated = check_X_y(x, self.treated, + force_all_finite=False) # get train indices for d == treatment_level - smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, treated) g0_external = external_predictions['ml_g0'] is not None g1_external = external_predictions['ml_g1'] is not None m_external = external_predictions['ml_m'] is not None @@ -169,7 +169,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa est_params=self._get_params('ml_g0'), method=self._predict_method['ml_g'], return_models=return_models) _check_finite_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls) - g_hat0['targets'] = _cond_targets(g_hat0['targets'], cond_sample=(d == 0)) + g_hat0['targets'] = _cond_targets(g_hat0['targets'], cond_sample=(treated == 0)) if self._dml_data.binary_outcome: binary_preds = (type_of_target(g_hat0['preds']) == 'binary') @@ -191,7 +191,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return_models=return_models) _check_finite_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', smpls) # adjust target values to consider only compatible subsamples - g_hat1['targets'] = _cond_targets(g_hat1['targets'], cond_sample=(d == 1)) + g_hat1['targets'] = _cond_targets(g_hat1['targets'], cond_sample=(treated == 1)) if self._dml_data.binary_outcome: binary_preds = (type_of_target(g_hat1['preds']) == 'binary') @@ -209,7 +209,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + m_hat = _dml_cv_predict(self._learner['ml_m'], x, treated, smpls=smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models) _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) @@ -218,7 +218,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # also trimm external predictions m_hat['preds'] = _trimm(m_hat['preds'], self.trimming_rule, self.trimming_threshold) - psi_a, 
psi_b = self._score_elements(y, d, g_hat0['preds'], g_hat1['preds'], + psi_a, psi_b = self._score_elements(y, treated, g_hat0['preds'], g_hat1['preds'], m_hat['preds'], smpls) psi_elements = {'psi_a': psi_a, 'psi_b': psi_b} @@ -235,16 +235,16 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa } return psi_elements, preds - def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, smpls): + def _score_elements(self, y, treated, g_hat0, g_hat1, m_hat, smpls): m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64') if self.normalize_ipw: - m_hat_adj = _normalize_ipw(m_hat, d) + m_hat_adj = _normalize_ipw(m_hat, treated) else: m_hat_adj = m_hat u_hat = y - g_hat1 weights, weights_bar = self._get_weights(m_hat=m_hat_adj) - psi_b = weights * g_hat1 + weights_bar * np.divide(np.multiply(d, u_hat), m_hat_adj) + psi_b = weights * g_hat1 + weights_bar * np.divide(np.multiply(treated, u_hat), m_hat_adj) psi_a = np.full_like(m_hat_adj, -1.0) return psi_a, psi_b @@ -252,7 +252,7 @@ def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, smpls): def _sensitivity_element_est(self, preds): # set elments for readability y = self._dml_data.y - d = self.treated + treated = self.treated m_hat = preds['predictions']['ml_m'] g_hat0 = preds['predictions']['ml_g0'] @@ -260,13 +260,13 @@ def _sensitivity_element_est(self, preds): weights, weights_bar = self._get_weights(m_hat=m_hat) - sigma2_score_element = np.square(y - np.multiply(d, g_hat1) - np.multiply(1.0-d, g_hat0)) + sigma2_score_element = np.square(y - np.multiply(treated, g_hat1) - np.multiply(1.0-treated, g_hat0)) sigma2 = np.mean(sigma2_score_element) psi_sigma2 = sigma2_score_element - sigma2 # calc m(W,alpha) and Riesz representer m_alpha = np.multiply(weights, np.multiply(weights_bar, np.divide(1.0, m_hat))) - rr = np.multiply(weights_bar, np.divide(d, m_hat)) + rr = np.multiply(weights_bar, np.divide(treated, m_hat)) nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) nu2 = np.mean(nu2_score_element) From 592e6420a730059b5f25d998b0d150afb6447545 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 18:19:31 +0200 Subject: [PATCH 09/98] Create _utils_apo_manual.py --- doubleml/irm/tests/_utils_apo_manual.py | 125 ++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 doubleml/irm/tests/_utils_apo_manual.py diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py new file mode 100644 index 000000000..329f0830c --- /dev/null +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -0,0 +1,125 @@ +import numpy as np +from sklearn.base import clone, is_classifier + +from ...tests._utils_boot import boot_manual, draw_weights +from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search + +from ...utils._estimation import _normalize_ipw +from ...utils._checks import _check_is_propensity + + +def fit_apo(y, x, d, + learner_g, learner_m, treatment_level, all_smpls, score, + n_rep=1, g0_params=None, g1_params=None, m_params=None, + normalize_ipw=False, trimming_threshold=1e-2): + n_obs = len(y) + treated = (d == treatment_level) + + thetas = np.zeros(n_rep) + ses = np.zeros(n_rep) + all_g_hat0 = list() + all_g_hat1 = list() + all_m_hat = list() + + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + g_hat0, g_hat1, m_hat = fit_nuisance_apo(y, x, d, treated, + learner_g, learner_m, smpls, score, + g0_params=g0_params, g1_params=g1_params, m_params=m_params, + 
trimming_threshold=trimming_threshold) + + all_g_hat0.append(g_hat0) + all_g_hat1.append(g_hat1) + all_m_hat.append(m_hat) + + thetas[i_rep], ses[i_rep] = apo_dml2(y, x, d, treated, + g_hat0, g_hat1, m_hat, + smpls, score, normalize_ipw) + + theta = np.median(thetas) + se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) + + res = {'theta': theta, 'se': se, + 'thetas': thetas, 'ses': ses, + 'all_g_hat0': all_g_hat0, 'all_g_hat1': all_g_hat1, 'all_m_hat': all_m_hat} + + return res + + +def fit_nuisance_apo(y, x, d, treated, + learner_g, learner_m, smpls, score, + g0_params=None, g1_params=None, m_params=None, + trimming_threshold=1e-12): + ml_g0 = clone(learner_g) + ml_g1 = clone(learner_g) + + train_cond0 = np.where(treated == 0)[0] + if is_classifier(learner_g): + g_hat0_list = fit_predict_proba(y, x, ml_g0, g0_params, smpls, + train_cond=train_cond0) + else: + g_hat0_list = fit_predict(y, x, ml_g0, g0_params, smpls, + train_cond=train_cond0) + + train_cond1 = np.where(treated == 1)[0] + if is_classifier(learner_g): + g_hat1_list = fit_predict_proba(y, x, ml_g1, g1_params, smpls, + train_cond=train_cond1) + else: + g_hat1_list = fit_predict(y, x, ml_g1, g1_params, smpls, + train_cond=train_cond1) + + ml_m = clone(learner_m) + m_hat_list = fit_predict_proba(treated, x, ml_m, m_params, smpls, + trimming_threshold=trimming_threshold) + + return g_hat0_list, g_hat1_list, m_hat_list + + +def compute_residuals(y, g_hat0_list, g_hat1_list, m_hat_list, smpls): + u_hat0 = np.full_like(y, np.nan, dtype='float64') + u_hat1 = np.full_like(y, np.nan, dtype='float64') + g_hat0 = np.full_like(y, np.nan, dtype='float64') + g_hat1 = np.full_like(y, np.nan, dtype='float64') + m_hat = np.full_like(y, np.nan, dtype='float64') + for idx, (_, test_index) in enumerate(smpls): + u_hat0[test_index] = y[test_index] - g_hat0_list[idx] + u_hat1[test_index] = y[test_index] - g_hat1_list[idx] + g_hat0[test_index] = g_hat0_list[idx] + g_hat1[test_index] = g_hat1_list[idx] + m_hat[test_index] = m_hat_list[idx] + + _check_is_propensity(m_hat, 'learner_m', 'ml_m', smpls, eps=1e-12) + return u_hat0, u_hat1, g_hat0, g_hat1, m_hat + + +def apo_dml2(y, x, d, treated, g_hat0_list, g_hat1_list, m_hat_list, smpls, score, normalize_ipw): + n_obs = len(y) + u_hat0, u_hat1, g_hat0, g_hat1, m_hat = compute_residuals( + y, g_hat0_list, g_hat1_list, m_hat_list, smpls + ) + + if normalize_ipw: + m_hat_adj = _normalize_ipw(m_hat, treated) + else: + m_hat_adj = m_hat + + theta_hat = apo_orth(g_hat0, g_hat1, m_hat_adj, + u_hat0, u_hat1, treated, score) + + se = np.sqrt(var_apo(theta_hat, g_hat0, g_hat1, + m_hat_adj, + u_hat0, u_hat1, + treated, score, n_obs)) + + return theta_hat, se + + +def apo_orth(g_hat0, g_hat1, m_hat, p_hat, u_hat0, u_hat1, treated, score): + res = np.mean(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat)) + return res + + +def var_apo(theta, g_hat0, g_hat1, m_hat, p_hat, u_hat0, u_hat1, treated, score, n_obs): + var = 1/n_obs * np.mean(np.power(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat), 2)) + return var From fbc9e717b875d36d4447461d6114f56e8369fe0c Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Tue, 18 Jun 2024 19:27:46 +0200 Subject: [PATCH 10/98] first manual apo test --- doubleml/irm/tests/_utils_apo_manual.py | 4 +- doubleml/irm/tests/test_apo.py | 103 ++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 doubleml/irm/tests/test_apo.py diff --git 
a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index 329f0830c..c7a9b4321 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -115,11 +115,11 @@ def apo_dml2(y, x, d, treated, g_hat0_list, g_hat1_list, m_hat_list, smpls, scor return theta_hat, se -def apo_orth(g_hat0, g_hat1, m_hat, p_hat, u_hat0, u_hat1, treated, score): +def apo_orth(g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score): res = np.mean(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat)) return res -def var_apo(theta, g_hat0, g_hat1, m_hat, p_hat, u_hat0, u_hat1, treated, score, n_obs): +def var_apo(theta, g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score, n_obs): var = 1/n_obs * np.mean(np.power(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat), 2)) return var diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py new file mode 100644 index 000000000..939310d99 --- /dev/null +++ b/doubleml/irm/tests/test_apo.py @@ -0,0 +1,103 @@ +import numpy as np +import pandas as pd +import pytest +import math + +from sklearn.base import clone + +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +import doubleml as dml +from doubleml.datasets import make_irm_data_discrete_treatements +from doubleml.utils.resampling import DoubleMLResampling + +from ...tests._utils import draw_smpls +from ._utils_apo_manual import fit_apo + + +@pytest.fixture(scope='module', + params=[[LinearRegression(), + LogisticRegression(solver='lbfgs', max_iter=250)], + [RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)]]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=[False, True]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0.2, 0.15]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope='module') +def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshold): + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 499 + + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + np.random.seed(3141) + n_obs = 100 + data_apo = make_irm_data_discrete_treatements(n_obs=n_obs) + y = data_apo['y'] + x = data_apo['x'] + d = data_apo['d'] + df_apo = pd.DataFrame( + np.column_stack((y, d, x)), + columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo['x'].shape[1])] + ) + + dml_data = dml.DoubleMLData(df_apo, 'y', 'd') + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) + + np.random.seed(3141) + dml_obj = dml.DoubleMLAPO(dml_data, + ml_g, ml_m, + treatment_level=0, + n_folds=n_folds, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + trimming_threshold=trimming_threshold) + + # synchronize the sample splitting + dml_obj.set_sample_splitting(all_smpls=all_smpls) + dml_obj.fit() + + np.random.seed(3141) + res_manual = fit_apo(y, x, d, + clone(learner[0]), clone(learner[1]), + treatment_level=0, + all_smpls=all_smpls, + score='APO', + normalize_ipw=normalize_ipw, + trimming_threshold=trimming_threshold) + + res_dict = {'coef': dml_obj.coef, + 'coef_manual': res_manual['theta'], + 'coef_ext': dml_obj.coef_extern, + 'se': dml_obj.se, + 'se_manual': res_manual['se']} + + return res_dict + + +@pytest.mark.ci +def test_dml_apo_coef(dml_apo_fixture): 
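+    # the fitted coefficient should match the manual cross-fitting implementation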
+ assert math.isclose(dml_apo_fixture['coef'][0], + dml_apo_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_apo_fixture['coef'][0], + dml_apo_fixture['coef_ext'][0], + rel_tol=1e-9, abs_tol=1e-4) \ No newline at end of file From fab0ae2349414b8f096583ad18666e3ef616dfea Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:54:21 +0200 Subject: [PATCH 11/98] add external prediction test and se unit test --- doubleml/irm/apo.py | 6 ++-- doubleml/irm/tests/_utils_apo_manual.py | 2 +- doubleml/irm/tests/test_apo.py | 40 ++++++++++++++++++++++--- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index 4c6298b9a..d841475af 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -135,7 +135,7 @@ def _initialize_weights(self, weights): assert isinstance(weights, dict) self._weights = weights - def _get_weights(self, m_hat=None): + def _get_weights(self): # standard case for APO/ATE weights = self._weights['weights'] if 'weights_bar' not in self._weights.keys(): @@ -243,7 +243,7 @@ def _score_elements(self, y, treated, g_hat0, g_hat1, m_hat, smpls): m_hat_adj = m_hat u_hat = y - g_hat1 - weights, weights_bar = self._get_weights(m_hat=m_hat_adj) + weights, weights_bar = self._get_weights() psi_b = weights * g_hat1 + weights_bar * np.divide(np.multiply(treated, u_hat), m_hat_adj) psi_a = np.full_like(m_hat_adj, -1.0) @@ -258,7 +258,7 @@ def _sensitivity_element_est(self, preds): g_hat0 = preds['predictions']['ml_g0'] g_hat1 = preds['predictions']['ml_g1'] - weights, weights_bar = self._get_weights(m_hat=m_hat) + weights, weights_bar = self._get_weights() sigma2_score_element = np.square(y - np.multiply(treated, g_hat1) - np.multiply(1.0-treated, g_hat0)) sigma2 = np.mean(sigma2_score_element) diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index c7a9b4321..38bb0870c 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -121,5 +121,5 @@ def apo_orth(g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score): def var_apo(theta, g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score, n_obs): - var = 1/n_obs * np.mean(np.power(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat), 2)) + var = 1/n_obs * np.mean(np.power(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat) - theta, 2)) return var diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index 939310d99..1f96a5d51 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -18,7 +18,7 @@ @pytest.fixture(scope='module', params=[[LinearRegression(), - LogisticRegression(solver='lbfgs', max_iter=250)], + LogisticRegression(solver='lbfgs', max_iter=250, random_state=42)], [RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)]]) def learner(request): @@ -67,6 +67,7 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol ml_g, ml_m, treatment_level=0, n_folds=n_folds, + score='APO', normalize_ipw=normalize_ipw, draw_sample_splitting=False, trimming_threshold=trimming_threshold) @@ -84,11 +85,32 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol normalize_ipw=normalize_ipw, trimming_threshold=trimming_threshold) + np.random.seed(3141) + # test with external nuisance predictions + dml_obj_ext = 
dml.DoubleMLAPO(dml_data, + ml_g, ml_m, + treatment_level=0, + n_folds=n_folds, + score='APO', + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + trimming_threshold=trimming_threshold) + + # synchronize the sample splitting + dml_obj_ext.set_sample_splitting(all_smpls=all_smpls) + + prediction_dict = {'d': {'ml_g0': dml_obj.predictions['ml_g0'].reshape(-1, 1), + 'ml_g1': dml_obj.predictions['ml_g1'].reshape(-1, 1), + 'ml_m': dml_obj.predictions['ml_m'].reshape(-1, 1)}} + dml_obj_ext.fit(external_predictions=prediction_dict) + + res_dict = {'coef': dml_obj.coef, 'coef_manual': res_manual['theta'], - 'coef_ext': dml_obj.coef_extern, + 'coef_ext': dml_obj_ext.coef, 'se': dml_obj.se, - 'se_manual': res_manual['se']} + 'se_manual': res_manual['se'], + 'se_ext': dml_obj_ext.se} return res_dict @@ -100,4 +122,14 @@ def test_dml_apo_coef(dml_apo_fixture): rel_tol=1e-9, abs_tol=1e-4) assert math.isclose(dml_apo_fixture['coef'][0], dml_apo_fixture['coef_ext'][0], - rel_tol=1e-9, abs_tol=1e-4) \ No newline at end of file + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_se(dml_apo_fixture): + assert math.isclose(dml_apo_fixture['se'][0], + dml_apo_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_apo_fixture['se'][0], + dml_apo_fixture['se_ext'][0], + rel_tol=1e-9, abs_tol=1e-4) From b814fce6f4135c9fa747384174489112efd69bea Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 13:57:26 +0200 Subject: [PATCH 12/98] add sensitivity and bootstrap test for apo --- doubleml/irm/tests/_utils_apo_manual.py | 80 ++++++++++++++++++++++ doubleml/irm/tests/test_apo.py | 89 +++++++++++++++++++++---- 2 files changed, 155 insertions(+), 14 deletions(-) diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index 38bb0870c..24506cf0e 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -123,3 +123,83 @@ def apo_orth(g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score): def var_apo(theta, g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score, n_obs): var = 1/n_obs * np.mean(np.power(g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat) - theta, 2)) return var + + +def boot_apo(y, d, treatment_level, thetas, ses, all_g_hat0, all_g_hat1, all_m_hat, + all_smpls, score, bootstrap, n_rep_boot, + n_rep=1, apply_cross_fitting=True, normalize_ipw=True): + treated = (d == treatment_level) + all_boot_t_stat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + if apply_cross_fitting: + n_obs = len(y) + else: + test_index = smpls[0][1] + n_obs = len(test_index) + weights = draw_weights(bootstrap, n_rep_boot, n_obs) + boot_t_stat = boot_apo_single_split( + thetas[i_rep], y, d, treated, + all_g_hat0[i_rep], all_g_hat1[i_rep], all_m_hat[i_rep], smpls, + score, ses[i_rep], weights, n_rep_boot, apply_cross_fitting, normalize_ipw) + all_boot_t_stat.append(boot_t_stat) + + boot_t_stat = np.hstack(all_boot_t_stat) + + return boot_t_stat + + +def boot_apo_single_split(theta, y, d, treated, g_hat0_list, g_hat1_list, m_hat_list, + smpls, score, se, weights, n_rep_boot, apply_cross_fitting, normalize_ipw): + _, u_hat1, _, g_hat1, m_hat = compute_residuals( + y, g_hat0_list, g_hat1_list, m_hat_list, smpls) + + m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64') + if normalize_ipw: + m_hat_adj = _normalize_ipw(m_hat, treated) + else: + m_hat_adj = m_hat + + J = -1.0 + psi = g_hat1 + 
np.divide(np.multiply(treated, u_hat1), m_hat_adj) - theta + boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep_boot, apply_cross_fitting) + + return boot_t_stat + + +def fit_sensitivity_elements_apo(y, d, treatment_level, all_coef, predictions, score, n_rep): + n_treat = 1 + n_obs = len(y) + treated = (d == treatment_level) + + sigma2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) + nu2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) + psi_sigma2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) + psi_nu2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) + + for i_rep in range(n_rep): + + m_hat = predictions['ml_m'][:, i_rep, 0] + g_hat0 = predictions['ml_g0'][:, i_rep, 0] + g_hat1 = predictions['ml_g1'][:, i_rep, 0] + + weights = np.ones_like(d) + weights_bar = np.ones_like(d) + + sigma2_score_element = np.square(y - np.multiply(treated, g_hat1) - np.multiply(1.0-treated, g_hat0)) + sigma2[0, i_rep, 0] = np.mean(sigma2_score_element) + psi_sigma2[:, i_rep, 0] = sigma2_score_element - sigma2[0, i_rep, 0] + + # calc m(W,alpha) and Riesz representer + m_alpha = np.multiply(weights, np.multiply(weights_bar, np.divide(1.0, m_hat))) + rr = np.multiply(weights_bar, np.divide(treated, m_hat)) + + nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) + nu2[0, i_rep, 0] = np.mean(nu2_score_element) + psi_nu2[:, i_rep, 0] = nu2_score_element - nu2[0, i_rep, 0] + + element_dict = {'sigma2': sigma2, + 'nu2': nu2, + 'psi_sigma2': psi_sigma2, + 'psi_nu2': psi_nu2} + return element_dict diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index 1f96a5d51..535f2397d 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -13,7 +13,7 @@ from doubleml.utils.resampling import DoubleMLResampling from ...tests._utils import draw_smpls -from ._utils_apo_manual import fit_apo +from ._utils_apo_manual import fit_apo, boot_apo, fit_sensitivity_elements_apo @pytest.fixture(scope='module', @@ -42,7 +42,7 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol boot_methods = ['normal'] n_folds = 2 n_rep_boot = 499 - + treatment_level = 0 # Set machine learning methods for m & g ml_g = clone(learner[0]) @@ -64,13 +64,13 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol np.random.seed(3141) dml_obj = dml.DoubleMLAPO(dml_data, - ml_g, ml_m, - treatment_level=0, - n_folds=n_folds, - score='APO', - normalize_ipw=normalize_ipw, - draw_sample_splitting=False, - trimming_threshold=trimming_threshold) + ml_g, ml_m, + treatment_level=treatment_level, + n_folds=n_folds, + score='APO', + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + trimming_threshold=trimming_threshold) # synchronize the sample splitting dml_obj.set_sample_splitting(all_smpls=all_smpls) @@ -79,7 +79,7 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol np.random.seed(3141) res_manual = fit_apo(y, x, d, clone(learner[0]), clone(learner[1]), - treatment_level=0, + treatment_level=treatment_level, all_smpls=all_smpls, score='APO', normalize_ipw=normalize_ipw, @@ -89,7 +89,7 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol # test with external nuisance predictions dml_obj_ext = dml.DoubleMLAPO(dml_data, ml_g, ml_m, - treatment_level=0, + treatment_level=treatment_level, n_folds=n_folds, score='APO', normalize_ipw=normalize_ipw, @@ -104,14 +104,45 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, 
trimming_threshol 'ml_m': dml_obj.predictions['ml_m'].reshape(-1, 1)}} dml_obj_ext.fit(external_predictions=prediction_dict) - res_dict = {'coef': dml_obj.coef, 'coef_manual': res_manual['theta'], 'coef_ext': dml_obj_ext.coef, 'se': dml_obj.se, 'se_manual': res_manual['se'], - 'se_ext': dml_obj_ext.se} - + 'se_ext': dml_obj_ext.se, + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_apo(y, d, treatment_level, res_manual['thetas'], res_manual['ses'], + res_manual['all_g_hat0'], res_manual['all_g_hat1'], + res_manual['all_m_hat'], + all_smpls, + score='APO', + bootstrap=bootstrap, + n_rep_boot=n_rep_boot, + normalize_ipw=normalize_ipw) + + np.random.seed(3141) + dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_obj_ext.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + res_dict['boot_t_stat' + bootstrap + '_ext'] = dml_obj_ext.boot_t_stat + + # check if sensitivity score with rho=0 gives equal asymptotic standard deviation + dml_obj.sensitivity_analysis(rho=0.0) + res_dict['sensitivity_ses'] = dml_obj.sensitivity_params['se'] + + # sensitivity tests + res_dict['sensitivity_elements'] = dml_obj.sensitivity_elements + res_dict['sensitivity_elements_manual'] = fit_sensitivity_elements_apo(y, d, + treatment_level, + all_coef=dml_obj.all_coef, + predictions=dml_obj.predictions, + score='APO', + n_rep=1) return res_dict @@ -133,3 +164,33 @@ def test_dml_apo_se(dml_apo_fixture): assert math.isclose(dml_apo_fixture['se'][0], dml_apo_fixture['se_ext'][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_boot(dml_apo_fixture): + for bootstrap in dml_apo_fixture['boot_methods']: + assert np.allclose(dml_apo_fixture['boot_t_stat' + bootstrap], + dml_apo_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_apo_fixture['boot_t_stat' + bootstrap], + dml_apo_fixture['boot_t_stat' + bootstrap + '_ext'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_sensitivity_rho0(dml_apo_fixture): + assert np.allclose(dml_apo_fixture['se'], + dml_apo_fixture['sensitivity_ses']['lower'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_apo_fixture['se'], + dml_apo_fixture['sensitivity_ses']['upper'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_sensitivity(dml_apo_fixture): + sensitivity_element_names = ['sigma2', 'nu2', 'psi_sigma2', 'psi_nu2'] + for sensitivity_element in sensitivity_element_names: + assert np.allclose(dml_apo_fixture['sensitivity_elements'][sensitivity_element], + dml_apo_fixture['sensitivity_elements_manual'][sensitivity_element], + rtol=1e-9, atol=1e-4) From 118f42ad4c4cedfefcebcfd0b5d985f1691e5c32 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 14:21:55 +0200 Subject: [PATCH 13/98] add test for external predictions in apo --- doubleml/irm/tests/_utils_apo_manual.py | 2 +- doubleml/irm/tests/test_apo.py | 1 - .../tests/test_apo_external_predictions.py | 96 +++++++++++++++++++ 3 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 doubleml/irm/tests/test_apo_external_predictions.py diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index 24506cf0e..7b07caafe 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ 
b/doubleml/irm/tests/_utils_apo_manual.py @@ -2,7 +2,7 @@ from sklearn.base import clone, is_classifier from ...tests._utils_boot import boot_manual, draw_weights -from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search +from ...tests._utils import fit_predict, fit_predict_proba from ...utils._estimation import _normalize_ipw from ...utils._checks import _check_is_propensity diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index 535f2397d..002c96cd4 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -10,7 +10,6 @@ import doubleml as dml from doubleml.datasets import make_irm_data_discrete_treatements -from doubleml.utils.resampling import DoubleMLResampling from ...tests._utils import draw_smpls from ._utils_apo_manual import fit_apo, boot_apo, fit_sensitivity_elements_apo diff --git a/doubleml/irm/tests/test_apo_external_predictions.py b/doubleml/irm/tests/test_apo_external_predictions.py new file mode 100644 index 000000000..533d3bffd --- /dev/null +++ b/doubleml/irm/tests/test_apo_external_predictions.py @@ -0,0 +1,96 @@ +import pytest +import numpy as np +import pandas as pd +import math + +from sklearn.linear_model import LinearRegression, LogisticRegression +from doubleml import DoubleMLAPO, DoubleMLData +from doubleml.datasets import make_irm_data_discrete_treatements +from doubleml.utils import DMLDummyRegressor, DMLDummyClassifier + +from ...tests._utils import draw_smpls + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_apo_fixture(n_rep, set_ml_m_ext, set_ml_g_ext): + + score = "APO" + treatment_level = 0 + ext_predictions = {"d": {}} + + np.random.seed(3141) + n_obs = 500 + data_apo = make_irm_data_discrete_treatements(n_obs=n_obs) + df_apo = pd.DataFrame( + np.column_stack((data_apo['y'], data_apo['d'], data_apo['x'])), + columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo['x'].shape[1])] + ) + + dml_data = DoubleMLData(df_apo, 'y', 'd') + d = data_apo['d'] + all_smpls = draw_smpls(n_obs, n_folds=5, n_rep=n_rep, groups=d) + + kwargs = { + "obj_dml_data": dml_data, + "score": score, + "treatment_level": treatment_level, + "n_rep": n_rep, + "draw_sample_splitting": False + } + + dml_obj = DoubleMLAPO(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + dml_obj.set_sample_splitting(all_smpls=all_smpls) + + np.random.seed(3141) + dml_obj.fit(store_predictions=True) + + if set_ml_m_ext: + ext_predictions["d"]["ml_m"] = dml_obj.predictions["ml_m"][:, :, 0] + ml_m = DMLDummyClassifier() + else: + ml_m = LogisticRegression(random_state=42) + + if set_ml_g_ext: + ext_predictions["d"]["ml_g0"] = dml_obj.predictions["ml_g0"][:, :, 0] + ext_predictions["d"]["ml_g1"] = dml_obj.predictions["ml_g1"][:, :, 0] + ml_g = DMLDummyRegressor() + else: + ml_g = LinearRegression() + + dml_obj_ext = DoubleMLAPO(ml_g=ml_g, ml_m=ml_m, **kwargs) + dml_obj_ext.set_sample_splitting(all_smpls=all_smpls) + + np.random.seed(3141) + dml_obj_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef_normal": dml_obj.coef[0], + "coef_ext": dml_obj_ext.coef[0] + } + + return res_dict + + +@pytest.mark.ci +def test_doubleml_apo_coef(doubleml_apo_fixture): + assert math.isclose( + 
doubleml_apo_fixture["coef_normal"], + doubleml_apo_fixture["coef_ext"], + rel_tol=1e-9, + abs_tol=1e-4 + ) From f1845602b068e818080d256637b2a470446d6044 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 14:26:37 +0200 Subject: [PATCH 14/98] Update test_apo_external_predictions.py --- doubleml/irm/tests/test_apo_external_predictions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/irm/tests/test_apo_external_predictions.py b/doubleml/irm/tests/test_apo_external_predictions.py index 533d3bffd..c60ee516a 100644 --- a/doubleml/irm/tests/test_apo_external_predictions.py +++ b/doubleml/irm/tests/test_apo_external_predictions.py @@ -27,7 +27,7 @@ def set_ml_g_ext(request): @pytest.fixture(scope="module") -def doubleml_apo_fixture(n_rep, set_ml_m_ext, set_ml_g_ext): +def doubleml_apo_ext_fixture(n_rep, set_ml_m_ext, set_ml_g_ext): score = "APO" treatment_level = 0 @@ -87,10 +87,10 @@ def doubleml_apo_fixture(n_rep, set_ml_m_ext, set_ml_g_ext): @pytest.mark.ci -def test_doubleml_apo_coef(doubleml_apo_fixture): +def test_doubleml_apo_ext_coef(doubleml_apo_ext_fixture): assert math.isclose( - doubleml_apo_fixture["coef_normal"], - doubleml_apo_fixture["coef_ext"], + doubleml_apo_ext_fixture["coef_normal"], + doubleml_apo_ext_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-4 ) From 827345b1634fb2a2dbc0a8bb02340ae0aa7eb8b1 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 14:31:47 +0200 Subject: [PATCH 15/98] add classifier unit test for apo --- doubleml/irm/tests/test_apo_classifier.py | 117 ++++++++++++++++++++++ doubleml/irm/tests/test_irm_classifier.py | 1 - 2 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 doubleml/irm/tests/test_apo_classifier.py diff --git a/doubleml/irm/tests/test_apo_classifier.py b/doubleml/irm/tests/test_apo_classifier.py new file mode 100644 index 000000000..860a61ef3 --- /dev/null +++ b/doubleml/irm/tests/test_apo_classifier.py @@ -0,0 +1,117 @@ +import numpy as np +import pytest +import math + +from sklearn.base import clone + +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_apo_manual import fit_apo, boot_apo + + +@pytest.fixture(scope='module', + params=[[LogisticRegression(solver='lbfgs', max_iter=250), + LogisticRegression(solver='lbfgs', max_iter=250)], + [RandomForestClassifier(max_depth=2, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=2, n_estimators=10, random_state=42)]]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0.01, 0.05]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope='module') +def dml_apo_classifier_fixture(generate_data_irm_binary, learner, normalize_ipw, trimming_threshold): + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 499 + + treatment_level = 0 + score = "APO" + + # collect data + (x, y, d) = generate_data_irm_binary + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + np.random.seed(3141) + obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + dml_obj = 
dml.DoubleMLAPO(obj_dml_data, + ml_g, ml_m, + treatment_level=treatment_level, + n_folds=n_folds, + score=score, + normalize_ipw=normalize_ipw, + trimming_threshold=trimming_threshold, + draw_sample_splitting=False) + # synchronize the sample splitting + dml_obj.set_sample_splitting(all_smpls=all_smpls) + dml_obj.fit() + + np.random.seed(3141) + res_manual = fit_apo(y, x, d, + clone(learner[0]), clone(learner[1]), + treatment_level, + all_smpls, score, + normalize_ipw=normalize_ipw, trimming_threshold=trimming_threshold) + + res_dict = {'coef': dml_obj.coef, + 'coef_manual': res_manual['theta'], + 'se': dml_obj.se, + 'se_manual': res_manual['se'], + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_apo(y, d, treatment_level, res_manual['thetas'], res_manual['ses'], + res_manual['all_g_hat0'], res_manual['all_g_hat1'], + res_manual['all_m_hat'], + all_smpls, score, bootstrap, n_rep_boot, + normalize_ipw=normalize_ipw) + + np.random.seed(3141) + dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_apo_coef(dml_apo_classifier_fixture): + assert math.isclose(dml_apo_classifier_fixture['coef'], + dml_apo_classifier_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_se(dml_apo_classifier_fixture): + assert math.isclose(dml_apo_classifier_fixture['se'], + dml_apo_classifier_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_boot(dml_apo_classifier_fixture): + for bootstrap in dml_apo_classifier_fixture['boot_methods']: + assert np.allclose(dml_apo_classifier_fixture['boot_t_stat' + bootstrap], + dml_apo_classifier_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) diff --git a/doubleml/irm/tests/test_irm_classifier.py b/doubleml/irm/tests/test_irm_classifier.py index 46cdfb779..cfea434d0 100644 --- a/doubleml/irm/tests/test_irm_classifier.py +++ b/doubleml/irm/tests/test_irm_classifier.py @@ -1,4 +1,3 @@ - import numpy as np import pytest import math From 66f7df01953dde7f5ed3e7d4916e5777ae69e33d Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 15:22:16 +0200 Subject: [PATCH 16/98] add tune with unit test for apo --- doubleml/irm/apo.py | 49 +++++++- doubleml/irm/tests/_utils_apo_manual.py | 22 +++- doubleml/irm/tests/test_apo_tune.py | 159 ++++++++++++++++++++++++ 3 files changed, 224 insertions(+), 6 deletions(-) create mode 100644 doubleml/irm/tests/test_apo_tune.py diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index d841475af..c37ced88f 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -8,7 +8,7 @@ from ..double_ml_score_mixins import LinearScoreMixin from ..double_ml_data import DoubleMLData -from ..utils._estimation import _dml_cv_predict, _get_cond_smpls, _cond_targets, _trimm, \ +from ..utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls, _cond_targets, _trimm, \ _normalize_ipw from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_finite_predictions, \ _check_is_propensity @@ -280,10 +280,49 @@ def _sensitivity_element_est(self, preds): } return element_dict - def _nuisance_tuning(self): - # Tune nuisance parameters - # This is a placeholder for tuning logic - print("Tuning nuisance parameters...") + 
def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, + search_mode, n_iter_randomized_search): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, + force_all_finite=False) + x, treated = check_X_y(x, self.treated, + force_all_finite=False) + # get train indices for d == 0 and d == 1 + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, treated) + + if scoring_methods is None: + scoring_methods = {'ml_g': None, + 'ml_m': None} + + train_inds = [train_index for (train_index, _) in smpls] + train_inds_d0 = [train_index for (train_index, _) in smpls_d0] + train_inds_d1 = [train_index for (train_index, _) in smpls_d1] + g0_tune_res = _dml_tune(y, x, train_inds_d0, + self._learner['ml_g'], param_grids['ml_g'], scoring_methods['ml_g'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + g1_tune_res = list() + g1_tune_res = _dml_tune(y, x, train_inds_d1, + self._learner['ml_g'], param_grids['ml_g'], scoring_methods['ml_g'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + m_tune_res = _dml_tune(treated, x, train_inds, + self._learner['ml_m'], param_grids['ml_m'], scoring_methods['ml_m'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + g0_best_params = [xx.best_params_ for xx in g0_tune_res] + g1_best_params = [xx.best_params_ for xx in g1_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + params = {'ml_g0': g0_best_params, + 'ml_g1': g1_best_params, + 'ml_m': m_best_params} + tune_res = {'g0_tune': g0_tune_res, + 'g1_tune': g1_tune_res, + 'm_tune': m_tune_res} + + res = {'params': params, + 'tune_res': tune_res} + + return res def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index 7b07caafe..bc952be49 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -2,7 +2,7 @@ from sklearn.base import clone, is_classifier from ...tests._utils_boot import boot_manual, draw_weights -from ...tests._utils import fit_predict, fit_predict_proba +from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search from ...utils._estimation import _normalize_ipw from ...utils._checks import _check_is_propensity @@ -203,3 +203,23 @@ def fit_sensitivity_elements_apo(y, d, treatment_level, all_coef, predictions, s 'psi_sigma2': psi_sigma2, 'psi_nu2': psi_nu2} return element_dict + + +def tune_nuisance_apo(y, x, d, treatment_level, ml_g, ml_m, smpls, score, n_folds_tune, + param_grid_g, param_grid_m): + train_cond0 = np.where(d != treatment_level)[0] + g0_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, + train_cond=train_cond0) + + train_cond1 = np.where(d == treatment_level)[0] + g1_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, + train_cond=train_cond1) + + treated = (d == treatment_level) + m_tune_res = tune_grid_search(treated, x, ml_m, smpls, param_grid_m, n_folds_tune) + + g0_best_params = [xx.best_params_ for xx in g0_tune_res] + g1_best_params = [xx.best_params_ for xx in g1_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + return g0_best_params, g1_best_params, m_best_params diff --git a/doubleml/irm/tests/test_apo_tune.py b/doubleml/irm/tests/test_apo_tune.py new file mode 100644 index 000000000..3a818fcae --- /dev/null +++ b/doubleml/irm/tests/test_apo_tune.py @@ -0,0 +1,159 @@ +import numpy as np +import pytest +import math + +from 
sklearn.base import clone + +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestRegressor + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_apo_manual import fit_apo, boot_apo, tune_nuisance_apo + + +@pytest.fixture(scope='module', + params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope='module', + params=[LogisticRegression(random_state=42)]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope='module', + params=['APO']) +def score(request): + return request.param + + +@pytest.fixture(scope='module', + params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {'n_estimators': [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {'C': np.logspace(-4, 2, 10)} + return par_grid + + +@pytest.fixture(scope='module') +def dml_apo_tune_fixture(generate_data_irm, learner_g, learner_m, score, normalize_ipw, tune_on_folds): + par_grid = {'ml_g': get_par_grid(learner_g), + 'ml_m': get_par_grid(learner_m)} + n_folds_tune = 4 + + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 499 + treatment_level = 0 + + # collect data + (x, y, d) = generate_data_irm + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) + + # Set machine learning methods for m & g + ml_g = clone(learner_g) + ml_m = clone(learner_m) + + np.random.seed(3141) + obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + dml_obj = dml.DoubleMLAPO(obj_dml_data, + ml_g, ml_m, + treatment_level=treatment_level, + n_folds=n_folds, + score=score, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False) + # synchronize the sample splitting + dml_obj.set_sample_splitting(all_smpls=all_smpls) + np.random.seed(3141) + # tune hyperparameters + tune_res = dml_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, + return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLAPO) + + dml_obj.fit() + + np.random.seed(3141) + smpls = all_smpls[0] + + if tune_on_folds: + g0_params, g1_params, m_params = tune_nuisance_apo(y, x, d, treatment_level, + clone(learner_g), clone(learner_m), smpls, score, + n_folds_tune, + par_grid['ml_g'], par_grid['ml_m']) + else: + xx = [(np.arange(len(y)), np.array([]))] + g0_params, g1_params, m_params = tune_nuisance_apo(y, x, d, treatment_level, + clone(learner_g), clone(learner_m), xx, score, + n_folds_tune, + par_grid['ml_g'], par_grid['ml_m']) + g0_params = g0_params * n_folds + m_params = m_params * n_folds + g1_params = g1_params * n_folds + + res_manual = fit_apo(y, x, d, clone(learner_g), clone(learner_m), + treatment_level, + all_smpls, score, + normalize_ipw=normalize_ipw, + g0_params=g0_params, g1_params=g1_params, m_params=m_params) + + res_dict = {'coef': dml_obj.coef, + 'coef_manual': res_manual['theta'], + 'se': dml_obj.se, + 'se_manual': res_manual['se'], + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_apo(y, d, treatment_level, res_manual['thetas'], res_manual['ses'], + res_manual['all_g_hat0'], res_manual['all_g_hat1'], + res_manual['all_m_hat'], + all_smpls, score, bootstrap, n_rep_boot, + normalize_ipw=normalize_ipw) + + np.random.seed(3141) + 
dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_apo_tune_coef(dml_apo_tune_fixture): + assert math.isclose(dml_apo_tune_fixture['coef'], + dml_apo_tune_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_tune_se(dml_apo_tune_fixture): + assert math.isclose(dml_apo_tune_fixture['se'], + dml_apo_tune_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_tune_boot(dml_apo_tune_fixture): + for bootstrap in dml_apo_tune_fixture['boot_methods']: + assert np.allclose(dml_apo_tune_fixture['boot_t_stat' + bootstrap], + dml_apo_tune_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) From a43ab19a9ad076d82227db7cbfdb0d1df83f858f Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 15:55:36 +0200 Subject: [PATCH 17/98] Create test_apo_weighted_scores.py --- .../irm/tests/test_apo_weighted_scores.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 doubleml/irm/tests/test_apo_weighted_scores.py diff --git a/doubleml/irm/tests/test_apo_weighted_scores.py b/doubleml/irm/tests/test_apo_weighted_scores.py new file mode 100644 index 000000000..36de26811 --- /dev/null +++ b/doubleml/irm/tests/test_apo_weighted_scores.py @@ -0,0 +1,94 @@ +import pytest +import numpy as np + +from sklearn.base import clone +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +from ...tests._utils import draw_smpls +import doubleml as dml + +@pytest.fixture(scope='module', + params=[[LinearRegression(), + LogisticRegression(solver='lbfgs', max_iter=250)], + [RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)]]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=['APO']) +def score(request): + return request.param + + +@pytest.fixture(scope='module', + params=[False, True]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0.2, 0.15]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0, 1]) +def treatment_level(request): + return request.param + + +@pytest.fixture(scope='module') +def weighted_apo_score_fixture(generate_data_irm, learner, score, normalize_ipw, trimming_threshold, + treatment_level): + n_folds = 2 + + # collect data + (x, y, d) = generate_data_irm + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) + obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + np.random.seed(3141) + dml_obj = dml.DoubleMLAPO(obj_dml_data, + ml_g, ml_m, + treatment_level, + n_folds, + score=score, + normalize_ipw=normalize_ipw, + trimming_threshold=trimming_threshold, + draw_sample_splitting=False) + dml_obj.set_sample_splitting(all_smpls=all_smpls) + dml_obj.fit() + + weights = 0.5 * np.ones_like(obj_dml_data.y) + dml_obj_weighted = dml.DoubleMLAPO(obj_dml_data, + ml_g, ml_m, + treatment_level, + n_folds, + score=score, + weights=weights, + normalize_ipw=normalize_ipw, + 
trimming_threshold=trimming_threshold, + draw_sample_splitting=False) + dml_obj_weighted.set_sample_splitting(all_smpls=all_smpls) + dml_obj_weighted.fit() + + result_dict = { + 'coef': dml_obj.coef, + 'weighted_coef': dml_obj_weighted.coef, + } + return result_dict + + +@pytest.mark.ci +def test_apo_weighted_coef(weighted_apo_score_fixture): + assert np.allclose(0.5 * weighted_apo_score_fixture['coef'], + weighted_apo_score_fixture['weighted_coef']) From aed2227f2c9a1d06f3f4ccc4a4ee24de5f0d29d8 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:02:16 +0200 Subject: [PATCH 18/98] Update test_apo_weighted_scores.py --- doubleml/irm/tests/test_apo_weighted_scores.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doubleml/irm/tests/test_apo_weighted_scores.py b/doubleml/irm/tests/test_apo_weighted_scores.py index 36de26811..94d81170c 100644 --- a/doubleml/irm/tests/test_apo_weighted_scores.py +++ b/doubleml/irm/tests/test_apo_weighted_scores.py @@ -8,6 +8,7 @@ from ...tests._utils import draw_smpls import doubleml as dml + @pytest.fixture(scope='module', params=[[LinearRegression(), LogisticRegression(solver='lbfgs', max_iter=250)], From 8f15923660f9277ac3487b8ccebcce61225812e9 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:30:32 +0200 Subject: [PATCH 19/98] adding capo and gapo to apo class --- doubleml/irm/apo.py | 70 ++++++++++++++++++++++++++++++++++ doubleml/irm/tests/test_apo.py | 56 +++++++++++++++++++++++++-- 2 files changed, 123 insertions(+), 3 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index c37ced88f..e530625de 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -1,10 +1,13 @@ import numpy as np +import pandas as pd +import warnings from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target from ..double_ml import DoubleML +from ..utils.blp import DoubleMLBLP from ..double_ml_score_mixins import LinearScoreMixin from ..double_ml_data import DoubleMLData @@ -341,3 +344,70 @@ def _check_data(self, obj_dml_data): ) return + + def capo(self, basis, is_gate=False): + """ + Calculate conditional average potential outcomes (CAPO) for a given basis. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, + where ``n_obs`` is the number of observations and ``d`` is the number of predictors. + is_gate : bool + Indicates whether the basis is constructed for GATE/GAPOs (dummy-basis). + Default is ``False``. + + Returns + ------- + model : :class:`doubleML.DoubleMLBLP` + Best linear Predictor model. + """ + valid_score = ['APO'] + if self.score not in valid_score: + raise ValueError('Invalid score ' + self.score + '. ' + + 'Valid score ' + ' or '.join(valid_score) + '.') + + if self.n_rep != 1: + raise NotImplementedError('Only implemented for one repetition. ' + + f'Number of repetitions is {str(self.n_rep)}.') + + # define the orthogonal signal + orth_signal = self.psi_elements['psi_b'].reshape(-1) + # fit the best linear predictor + model = DoubleMLBLP(orth_signal, basis=basis, is_gate=is_gate) + model.fit() + return model + + def gapo(self, groups): + """ + Calculate group average potential outcomes (GAPO) for groups. + + Parameters + ---------- + groups : :class:`pandas.DataFrame` + The group indicator for estimating the best linear predictor. Groups should be mutually exclusive. 
+ Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations + and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str). + + Returns + ------- + model : :class:`doubleML.DoubleMLBLP` + Best linear Predictor model for group average potential outcomes. + """ + if not isinstance(groups, pd.DataFrame): + raise TypeError('Groups must be of DataFrame type. ' + f'Groups of type {str(type(groups))} was passed.') + + if not all(groups.dtypes == bool) or all(groups.dtypes == int): + if groups.shape[1] == 1: + groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') + else: + raise TypeError('Columns of groups must be of bool type or int type (dummy coded). ' + 'Alternatively, groups should only contain one column.') + + if any(groups.sum(0) <= 5): + warnings.warn('At least one group effect is estimated with less than 6 observations.') + + model = self.capo(groups, is_gate=True) + return model diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index 002c96cd4..20effa0cb 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -9,7 +9,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatements +from doubleml.datasets import make_irm_data_discrete_treatements, make_irm_data from ...tests._utils import draw_smpls from ._utils_apo_manual import fit_apo, boot_apo, fit_sensitivity_elements_apo @@ -36,12 +36,17 @@ def trimming_threshold(request): return request.param +@pytest.fixture(scope='module', + params=[0, 1]) +def treatment_level(request): + return request.param + + @pytest.fixture(scope='module') -def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshold): +def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshold, treatment_level): boot_methods = ['normal'] n_folds = 2 n_rep_boot = 499 - treatment_level = 0 # Set machine learning methods for m & g ml_g = clone(learner[0]) @@ -193,3 +198,48 @@ def test_dml_apo_sensitivity(dml_apo_fixture): assert np.allclose(dml_apo_fixture['sensitivity_elements'][sensitivity_element], dml_apo_fixture['sensitivity_elements_manual'][sensitivity_element], rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_apo_capo_gapo(treatment_level): + n = 20 + # collect data + np.random.seed(42) + obj_dml_data = make_irm_data(n_obs=n, dim_x=2) + + # First stage estimation + ml_g = RandomForestRegressor(n_estimators=10) + ml_m = RandomForestClassifier(n_estimators=10) + + dml_obj = dml.DoubleMLAPO(obj_dml_data, + ml_m=ml_m, + ml_g=ml_g, + treatment_level=treatment_level, + trimming_threshold=0.05, + n_folds=5) + + dml_obj.fit() + # create a random basis + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) + capo = dml_obj.capo(random_basis) + assert isinstance(capo, dml.utils.blp.DoubleMLBLP) + assert isinstance(capo.confint(), pd.DataFrame) + + groups_1 = pd.DataFrame(np.column_stack([obj_dml_data.data['X1'] <= -1.0, + obj_dml_data.data['X1'] > 0.2]), + columns=['Group 1', 'Group 2']) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gapo_1 = dml_obj.gapo(groups_1) + assert isinstance(gapo_1, dml.utils.blp.DoubleMLBLP) + assert isinstance(gapo_1.confint(), pd.DataFrame) + assert all(gapo_1.confint().index == groups_1.columns.to_list()) + + np.random.seed(42) + groups_2 = 
pd.DataFrame(np.random.choice(["1", "2"], n, p=[0.1, 0.9])) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gapo_2 = dml_obj.gapo(groups_2) + assert isinstance(gapo_2, dml.utils.blp.DoubleMLBLP) + assert isinstance(gapo_2.confint(), pd.DataFrame) + assert all(gapo_2.confint().index == ["Group_1", "Group_2"]) From 134fc4ced66d7ada89ada97da116de066bcd7c15 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:43:18 +0200 Subject: [PATCH 20/98] small fixes to remove unnecessary lines --- doubleml/irm/apo.py | 2 -- doubleml/irm/irm.py | 2 -- doubleml/irm/tests/_utils_apo_manual.py | 1 - 3 files changed, 5 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index e530625de..76a0372f1 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -239,7 +239,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds def _score_elements(self, y, treated, g_hat0, g_hat1, m_hat, smpls): - m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64') if self.normalize_ipw: m_hat_adj = _normalize_ipw(m_hat, treated) else: @@ -302,7 +301,6 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ g0_tune_res = _dml_tune(y, x, train_inds_d0, self._learner['ml_g'], param_grids['ml_g'], scoring_methods['ml_g'], n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) - g1_tune_res = list() g1_tune_res = _dml_tune(y, x, train_inds_d1, self._learner['ml_g'], param_grids['ml_g'], scoring_methods['ml_g'], n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 0fa6ec749..3cf98ec36 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -340,7 +340,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, smpls): - m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64') if self.normalize_ipw: m_hat_adj = _normalize_ipw(m_hat, d) else: @@ -420,7 +419,6 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ g0_tune_res = _dml_tune(y, x, train_inds_d0, self._learner['ml_g'], param_grids['ml_g'], scoring_methods['ml_g'], n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) - g1_tune_res = list() g1_tune_res = _dml_tune(y, x, train_inds_d1, self._learner['ml_g'], param_grids['ml_g'], scoring_methods['ml_g'], n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index bc952be49..862a2793d 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -154,7 +154,6 @@ def boot_apo_single_split(theta, y, d, treated, g_hat0_list, g_hat1_list, m_hat_ _, u_hat1, _, g_hat1, m_hat = compute_residuals( y, g_hat0_list, g_hat1_list, m_hat_list, smpls) - m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64') if normalize_ipw: m_hat_adj = _normalize_ipw(m_hat, treated) else: From 0d5ecd5e1b40bbb3016503b983e9d21c4496ee11 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:25:33 +0200 Subject: [PATCH 21/98] update dgp --- doubleml/datasets.py | 149 +++++++++++++++++++++++++-------- doubleml/irm/tests/test_apo.py | 2 +- 2 files changed, 117 insertions(+), 34 deletions(-) diff --git 
a/doubleml/datasets.py b/doubleml/datasets.py
index cfbebdd99..346d79376 100644
--- a/doubleml/datasets.py
+++ b/doubleml/datasets.py
@@ -1435,60 +1435,143 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleM
         raise ValueError('Invalid return_type.')
 
 
-def make_irm_data_discrete_treatements(n_obs=200, p=10, support_size=5, n_levels=3, random_state=42):
+def make_irm_data_discrete_treatements(n_obs=200, n_levels=3, random_state=42, **kwargs):
     """
-    Generates data from a interactive regression (IRM) model with multiple treatment levels.
+    Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an
+    underlying continous treatment).
+
+    The data generating process is defined as follows (similar to the Monte Carlo simulation used
+    in Sant'Anna and Zhao (2020)).
+
+    Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
+    to the identity matrix.
+    Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
+    where
+
+    .. math::
+
+        \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1)
+
+        \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1))
+
+        \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3
+
+        \\tilde{Z}_4 &= (20 + X_2 + X_4)^2
+
+        \\tilde{Z}_5 &= X_5.
+
+    A continuous treatment :math:`D_{\\text{cont}}` is generated as
+
+    .. math::
+
+        D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D,
+
+    where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment
+    effect is defined as
+
+    .. math::
+
+        \\theta(d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2.
+
+    Based on the continous treatment, a discrete treatment :math:`D` is generated with a baseline level of
+    :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels
+    is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected.
+
+    The potential outcomes are defined as
+
+    .. math::
+
+        Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y
+
+        Y(1) &= \\theta(D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0),
+
+    where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as
+
+    .. math::
+
+        Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}.
+
+    The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``.
+
+    Parameters
+    ----------
+    n_obs : int
+        The number of observations to simulate.
+        Default is ``200``.
+
+    n_levels : int
+        The number of treatment levels.
+        Default is ``3``.
+
+    random_state : int
+        Random seed for reproducibility.
+        Default is ``42``.
+
+    Returns
+    -------
+    res_dict : dictionary
+        Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``.
+ """ np.random.seed(random_state) + xi = kwargs.get('xi', 0.3) + c = kwargs.get('c', 0.0) + dim_x = kwargs.get('dim_x', 5) - # define continous treatment effect - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - # Outcome support and coefficients - support_y = np.random.choice(np.arange(p), size=support_size, replace=False) - coefs_y = np.random.uniform(0, 1, size=support_size) - # treatment support and coefficients - support_d = support_y - range_coefs_d = [0.2, 0.3] - coefs_d = np.random.uniform(range_coefs_d[0], range_coefs_d[1], size=support_size) + def f_reg(w): + res = 210 + 27.4*w[:, 0] + 13.7*(w[:, 1] + w[:, 2] + w[:, 3]) + return res - # noise - epsilon = np.random.uniform(-1, 1, size=n_obs) + def f_treatment(w, xi): + res = xi * (-w[:, 0] + 0.5*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3]) + return res - # Generate controls, covariates, treatments and outcomes - x = np.random.uniform(0, 1, size=(n_obs, p)) - # Heterogeneous treatment effects - te = treatment_effect(x) + def treatment_effect(d): + return 0.1 * np.exp(d) + 10 * np.sin(0.7 * d) + 2 * d - 0.2 * np.square(d) + + z_tilde_1 = np.exp(0.5*x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - # set d to be a discrete number of levels - range_cont_d = support_size * range_coefs_d - # devide the range into n_levels - levels = np.linspace(range_cont_d[0], range_cont_d[1], n_levels - 1) + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - # define a discrete treatment version (with a baseline probability) + cont_d = f_treatment(z, xi) + eps_d + level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) + potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 eta = np.random.uniform(0, 1, size=n_obs) - potential_level = sum([1.0 * (np.dot(x[:, support_d], coefs_d) >= level) for level in levels]) + 1 - d = 1.0 * (eta >= 1/n_levels) * potential_level + observed_d = 1.0 * (eta >= 1/n_levels) * potential_level + ite = treatment_effect(cont_d) + y0 = f_reg(z) + eps_y # only treated for d > 0 compared to the baseline - y = te * (d > 0) + np.dot(x[:, support_y], coefs_y) + epsilon + y = ite * (observed_d > 0) + y0 oracle_values = { - 'levels': levels, - 'support_y': support_y, - 'coefs_y': coefs_y, - 'support_d': support_d, - 'coefs_d': coefs_d, - 'te': te, + 'cont_d': cont_d, + 'level_bounds': level_bounds, + 'potential_level': potential_level, + 'ite': ite, + 'y0': y0, 'treatment_effect': treatment_effect } resul_dict = { 'x': x, 'y': y, - 'd': d, + 'd': observed_d, 'oracle_values': oracle_values } diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index 20effa0cb..e365bc1e3 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -53,7 +53,7 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol ml_m = clone(learner[1]) np.random.seed(3141) - n_obs = 100 + n_obs = 500 data_apo = make_irm_data_discrete_treatements(n_obs=n_obs) y = 
data_apo['y'] x = data_apo['x'] From b86f8aaf4e5ac6c54c6c66c7f73e36d5f06386b3 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:32:47 +0200 Subject: [PATCH 22/98] fix typo --- doubleml/datasets.py | 2 +- doubleml/irm/tests/test_apo.py | 4 ++-- doubleml/irm/tests/test_apo_exceptions.py | 4 ++-- doubleml/irm/tests/test_apo_external_predictions.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index 346d79376..ea782bda5 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1435,7 +1435,7 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleM raise ValueError('Invalid return_type.') -def make_irm_data_discrete_treatements(n_obs=200, n_levels=3, random_state=42, **kwargs): +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=42, **kwargs): """ Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an underlying continous treatment). diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index e365bc1e3..7082e399b 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -9,7 +9,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatements, make_irm_data +from doubleml.datasets import make_irm_data_discrete_treatments, make_irm_data from ...tests._utils import draw_smpls from ._utils_apo_manual import fit_apo, boot_apo, fit_sensitivity_elements_apo @@ -54,7 +54,7 @@ def dml_apo_fixture(generate_data_irm, learner, normalize_ipw, trimming_threshol np.random.seed(3141) n_obs = 500 - data_apo = make_irm_data_discrete_treatements(n_obs=n_obs) + data_apo = make_irm_data_discrete_treatments(n_obs=n_obs) y = data_apo['y'] x = data_apo['x'] d = data_apo['d'] diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index cf5227957..ccab71855 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -3,12 +3,12 @@ import numpy as np from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatements, make_iivm_data +from doubleml.datasets import make_irm_data_discrete_treatments, make_iivm_data from sklearn.linear_model import Lasso, LogisticRegression n = 100 -data_apo = make_irm_data_discrete_treatements(n_obs=n) +data_apo = make_irm_data_discrete_treatments(n_obs=n) df_apo = pd.DataFrame(np.column_stack((data_apo['y'], data_apo['d'], data_apo['x'])), columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo['x'].shape[1])]) diff --git a/doubleml/irm/tests/test_apo_external_predictions.py b/doubleml/irm/tests/test_apo_external_predictions.py index c60ee516a..a3f77dea1 100644 --- a/doubleml/irm/tests/test_apo_external_predictions.py +++ b/doubleml/irm/tests/test_apo_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatements +from doubleml.datasets import make_irm_data_discrete_treatments from doubleml.utils import DMLDummyRegressor, DMLDummyClassifier from ...tests._utils import draw_smpls @@ -35,7 +35,7 @@ def doubleml_apo_ext_fixture(n_rep, set_ml_m_ext, set_ml_g_ext): np.random.seed(3141) n_obs = 500 - data_apo = 
make_irm_data_discrete_treatements(n_obs=n_obs) + data_apo = make_irm_data_discrete_treatments(n_obs=n_obs) df_apo = pd.DataFrame( np.column_stack((data_apo['y'], data_apo['d'], data_apo['x'])), columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo['x'].shape[1])] From 70b1b00253a1607467aca5187fd1f325fa229b94 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:35:14 +0200 Subject: [PATCH 23/98] remove seed from dgp --- doubleml/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index ea782bda5..fe06ad3d1 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1435,7 +1435,7 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleM raise ValueError('Invalid return_type.') -def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=42, **kwargs): +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=None, **kwargs): """ Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an underlying continous treatment). @@ -1513,8 +1513,8 @@ def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=42, ** Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. """ - - np.random.seed(random_state) + if random_state is not None: + np.random.seed(random_state) xi = kwargs.get('xi', 0.3) c = kwargs.get('c', 0.0) dim_x = kwargs.get('dim_x', 5) From 5be62d62db187e7a2070dfc3edfcbc407608f4ff Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:40:51 +0200 Subject: [PATCH 24/98] Add basic unit tests for dgp --- doubleml/datasets.py | 6 +++++- doubleml/tests/test_datasets.py | 34 ++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index fe06ad3d1..fd18affdc 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1519,6 +1519,11 @@ def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=None, c = kwargs.get('c', 0.0) dim_x = kwargs.get('dim_x', 5) + if not isinstance(n_levels, int): + raise ValueError('n_levels must be an integer.') + if n_levels < 2: + raise ValueError('n_levels must be at least 2.') + # observed covariates cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) @@ -1565,7 +1570,6 @@ def treatment_effect(d): 'potential_level': potential_level, 'ite': ite, 'y0': y0, - 'treatment_effect': treatment_effect } resul_dict = { diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py index d662cd075..a46754d05 100644 --- a/doubleml/tests/test_datasets.py +++ b/doubleml/tests/test_datasets.py @@ -5,7 +5,8 @@ from doubleml import DoubleMLData, DoubleMLClusterData from doubleml.datasets import fetch_401K, fetch_bonus, make_plr_CCDDHNR2018, make_plr_turrell2018, \ make_irm_data, make_iivm_data, _make_pliv_data, make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, \ - make_did_SZ2020, make_confounded_irm_data, make_confounded_plr_data, make_heterogeneous_data, make_ssm_data + make_did_SZ2020, make_confounded_irm_data, make_confounded_plr_data, make_heterogeneous_data, make_ssm_data, \ + make_irm_data_discrete_treatments msg_inv_return_type = 'Invalid return_type.' 
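[Editor's note: the assertions added below only pin down the dictionary layout returned by the generator. As orientation, a minimal usage sketch; this is not part of the patch series, it simply mirrors the fixtures used in the APO tests above, and the seed and sample size are illustrative:]

    import numpy as np
    import pandas as pd

    from doubleml import DoubleMLData
    from doubleml.datasets import make_irm_data_discrete_treatments

    # draw a sample; d == 0 is the untreated baseline, positive values index the levels
    data = make_irm_data_discrete_treatments(n_obs=500, n_levels=3, random_state=42)

    # the generator returns plain arrays, so wrap them for the DoubleML estimators
    df = pd.DataFrame(
        np.column_stack((data['y'], data['d'], data['x'])),
        columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])]
    )
    dml_data = DoubleMLData(df, 'y', 'd')

    # oracle quantities for simulation studies, e.g. individual effects and level bounds
    ite = data['oracle_values']['ite']
    level_bounds = data['oracle_values']['level_bounds']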
@@ -277,3 +278,34 @@ def test_make_ssm_data_return_types():
     assert isinstance(s, np.ndarray)
     with pytest.raises(ValueError, match=msg_inv_return_type):
         _ = make_ssm_data(n_obs=100, return_type='matrix')
+
+
+@pytest.fixture(scope='function',
+                params=[3, 5])
+def n_levels(request):
+    return request.param
+
+
+def test_make_data_discrete_treatments(n_levels):
+    np.random.seed(3141)
+    n = 100
+    data_apo = make_irm_data_discrete_treatments(n_obs=n, n_levels=n_levels)
+    assert isinstance(data_apo, dict)
+    assert isinstance(data_apo['y'], np.ndarray)
+    assert isinstance(data_apo['d'], np.ndarray)
+    assert isinstance(data_apo['x'], np.ndarray)
+    assert isinstance(data_apo['oracle_values'], dict)
+
+    assert isinstance(data_apo['oracle_values']['cont_d'], np.ndarray)
+    assert isinstance(data_apo['oracle_values']['level_bounds'], np.ndarray)
+    assert isinstance(data_apo['oracle_values']['potential_level'], np.ndarray)
+    assert isinstance(data_apo['oracle_values']['ite'], np.ndarray)
+    assert isinstance(data_apo['oracle_values']['y0'], np.ndarray)
+
+    msg = 'n_levels must be at least 2.'
+    with pytest.raises(ValueError, match=msg):
+        _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1)
+
+    msg = 'n_levels must be an integer.'
+    with pytest.raises(ValueError, match=msg):
+        _ = make_irm_data_discrete_treatments(n_obs=n, n_levels=1.1)

From 2a1903f00c26ac1b0f8b46df2d9b4bc7f95bfaec Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Fri, 21 Jun 2024 12:10:03 +0200
Subject: [PATCH 25/98] add docstring for APO model

---
 doubleml/irm/apo.py | 57 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py
index 76a0372f1..7fae6c292 100644
--- a/doubleml/irm/apo.py
+++ b/doubleml/irm/apo.py
@@ -18,9 +18,64 @@
 
 
 class DoubleMLAPO(LinearScoreMixin, DoubleML):
-    """Double machine learning average potential outcomes for interactive regression models
+    """Double machine learning average potential outcomes for interactive regression models.
 
     Parameters
+    ----------
+    obj_dml_data : :class:`DoubleMLData` object
+        The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model.
+
+    ml_g : estimator implementing ``fit()`` and ``predict()``
+        A machine learner implementing ``fit()`` and ``predict()`` methods (e.g.
+        :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`g_0(D,X) = E[Y|X,D]`.
+        For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and
+        ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``,
+        ``predict_proba()`` is used otherwise ``predict()``.
+
+    ml_m : classifier implementing ``fit()`` and ``predict_proba()``
+        A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g.
+        :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D|X]`.
+
+    treatment_level : int or float
+        Chosen treatment level for average potential outcomes.
+
+    n_folds : int
+        Number of folds.
+        Default is ``5``.
+
+    n_rep : int
+        Number of repetitions for the sample splitting.
+        Default is ``1``.
+
+    score : str or callable
+        A str (``'APO'``) specifying the score function.
+        Default is ``'APO'``.
+
+    weights : array, dict or None
+        A numpy array of weights for each individual observation. If None, then the ``'APO'`` score
+        is applied (corresponds to weights equal to 1).
+        An array has to be of shape ``(n,)``, where ``n`` is the number of observations.
+        A dictionary can be used to specify weights which depend on the treatment variable.
+        In this case, the dictionary has to contain two keys ``weights`` and ``weights_bar``, where the values
+        have to be arrays of shape ``(n,)`` and ``(n, n_rep)``.
+        Default is ``None``.
+
+    normalize_ipw : bool
+        Indicates whether the inverse probability weights are normalized.
+        Default is ``False``.
+
+    trimming_rule : str
+        A str (``'truncate'`` is the only choice) specifying the trimming approach.
+        Default is ``'truncate'``.
+
+    trimming_threshold : float
+        The threshold used for trimming.
+        Default is ``1e-2``.
+
+    draw_sample_splitting : bool
+        Indicates whether the sample splitting should be drawn during initialization of the object.
+        Default is ``True``.
+
     """
     def __init__(self,
                  obj_dml_data,

From 87cfa7665cacecb96ed535074a58e509f57caa5c Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Fri, 21 Jun 2024 12:51:32 +0200
Subject: [PATCH 26/98] add warning for low percentage of treatment level

---
 doubleml/irm/apo.py                       |  4 ++++
 doubleml/irm/tests/test_apo_exceptions.py | 13 +++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py
index 7fae6c292..423f3fa7e 100644
--- a/doubleml/irm/apo.py
+++ b/doubleml/irm/apo.py
@@ -396,6 +396,10 @@ def _check_data(self, obj_dml_data):
                 f'Number of treated observations: {np.sum(self.treated)} for treatment level {self.treatment_level}.'
             )
 
+        if np.mean(self.treated) <= 0.05:
+            warnings.warn(f'The proportion of observations with treatment level {self.treatment_level} is less than 5%.'
+                          f' Got {np.mean(self.treated) * 100:.2f}%.')
+
         return
diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py
index ccab71855..2e9a7a1bd 100644
--- a/doubleml/irm/tests/test_apo_exceptions.py
+++ b/doubleml/irm/tests/test_apo_exceptions.py
@@ -33,6 +33,19 @@ def test_apo_exception_data():
     with pytest.raises(ValueError, match=msg):
         _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=1.1)
 
+    msg = r'The proportion of observations with treatment level 42 is less than 5\%. Got 0.70\%.'
+    # test warning
+    with pytest.warns(UserWarning, match=msg):
+        data_apo_warn = make_irm_data_discrete_treatments(n_obs=1000)
+        data_apo_warn['d'][0:7] = 42
+        df_apo_warn = pd.DataFrame(
+            np.column_stack((data_apo_warn['y'], data_apo_warn['d'], data_apo_warn['x'])),
+            columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo_warn['x'].shape[1])]
+        )
+        dml_data_warn = DoubleMLData(df_apo_warn, 'y', 'd')
+
+        _ = DoubleMLAPO(dml_data_warn, ml_g, ml_m, treatment_level=42)
+

From 5c0501c8e0d0c751c9d53a14db91f47b87040664 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Fri, 21 Jun 2024 13:22:06 +0200
Subject: [PATCH 27/98] remove double check

---
 doubleml/irm/qte.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py
index 9fd220f19..9dd88def8 100644
--- a/doubleml/irm/qte.py
+++ b/doubleml/irm/qte.py
@@ -138,7 +138,6 @@ def __init__(self,
         self._trimming_threshold = trimming_threshold
         _check_trimming(self._trimming_rule, self._trimming_threshold)
 
-        self._check_quantile()
         if not isinstance(self.normalize_ipw, bool):
            raise TypeError('Normalization indicator has to be boolean. 
' + f'Object of type {str(type(self.normalize_ipw))} passed.') From 1506474816ef516e88f0d310976f97a5a0a80535 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:44:46 +0200 Subject: [PATCH 28/98] remove self_i_quant from qte --- doubleml/irm/qte.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 9dd88def8..8609a31af 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -411,14 +411,13 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ framework_list = [None] * self.n_quantiles for i_quant in range(self.n_quantiles): - self._i_quant = i_quant # save the parallel fitted models in the right list - self._modellist_0[self._i_quant] = fitted_models[self._i_quant][0] - self._modellist_1[self._i_quant] = fitted_models[self._i_quant][1] + self._modellist_0[i_quant] = fitted_models[i_quant][0] + self._modellist_1[i_quant] = fitted_models[i_quant][1] # set up the framework - framework_list[self._i_quant] = self._modellist_1[self._i_quant].framework - \ - self._modellist_0[self._i_quant].framework + framework_list[i_quant] = self._modellist_1[i_quant].framework - \ + self._modellist_0[i_quant].framework # aggregate all frameworks self._framework = concat(framework_list) @@ -558,7 +557,6 @@ def _initialize_models(self): 'draw_sample_splitting': False } for i_quant in range(self.n_quantiles): - self._i_quant = i_quant # initialize models for both potential quantiles if self.score == 'PQ': From 64a0e20bb63c386c85ce8fa6855f8b63b5058b12 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:44:56 +0200 Subject: [PATCH 29/98] add first apos model --- doubleml/__init__.py | 40 +-- doubleml/irm/apos.py | 293 +++++++++++++++++++++ doubleml/irm/tests/test_apos_exceptions.py | 72 +++++ 3 files changed, 387 insertions(+), 18 deletions(-) create mode 100644 doubleml/irm/apos.py create mode 100644 doubleml/irm/tests/test_apos_exceptions.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 69e064a00..c97bddf79 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -6,6 +6,7 @@ from .plm.pliv import DoubleMLPLIV from .irm.irm import DoubleMLIRM from .irm.apo import DoubleMLAPO +from .irm.apos import DoubleMLAPOS from .irm.iivm import DoubleMLIIVM from .double_ml_data import DoubleMLData, DoubleMLClusterData from .did.did import DoubleMLDID @@ -19,23 +20,26 @@ from .utils.blp import DoubleMLBLP from .utils.policytree import DoubleMLPolicyTree -__all__ = ['concat', - 'DoubleMLFramework', - 'DoubleMLPLR', - 'DoubleMLPLIV', - 'DoubleMLIRM', - 'DoubleMLAPO', - 'DoubleMLIIVM', - 'DoubleMLData', - 'DoubleMLClusterData', - 'DoubleMLDID', - 'DoubleMLDIDCS', - 'DoubleMLPQ', - 'DoubleMLQTE', - 'DoubleMLLPQ', - 'DoubleMLCVAR', - 'DoubleMLBLP', - 'DoubleMLPolicyTree', - 'DoubleMLSSM'] +__all__ = [ + 'concat', + 'DoubleMLFramework', + 'DoubleMLPLR', + 'DoubleMLPLIV', + 'DoubleMLIRM', + 'DoubleMLAPO', + 'DoubleMLAPOS', + 'DoubleMLIIVM', + 'DoubleMLData', + 'DoubleMLClusterData', + 'DoubleMLDID', + 'DoubleMLDIDCS', + 'DoubleMLPQ', + 'DoubleMLQTE', + 'DoubleMLLPQ', + 'DoubleMLCVAR', + 'DoubleMLBLP', + 'DoubleMLPolicyTree', + 'DoubleMLSSM' +] __version__ = importlib.metadata.version('doubleml') diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py new file mode 100644 index 000000000..24768bf9d --- /dev/null +++ b/doubleml/irm/apos.py @@ -0,0 +1,293 @@ +import 
numpy as np
+import pandas as pd
+
+from sklearn.base import clone
+
+from joblib import Parallel, delayed
+
+from ..double_ml_data import DoubleMLData, DoubleMLClusterData
+from .apo import DoubleMLAPO
+from ..double_ml_framework import concat
+
+from ..utils.resampling import DoubleMLResampling
+from ..utils._checks import _check_score, _check_trimming
+
+
+class DoubleMLAPOS:
+    """Double machine learning for interactive regression models with multiple discrete treatments.
+    """
+    def __init__(self,
+                 obj_dml_data,
+                 ml_g,
+                 ml_m,
+                 treatment_levels,
+                 n_folds=5,
+                 n_rep=1,
+                 score='APO',
+                 weights=None,
+                 normalize_ipw=False,
+                 trimming_rule='truncate',
+                 trimming_threshold=1e-2,
+                 draw_sample_splitting=True):
+
+        self._dml_data = obj_dml_data
+        self._is_cluster_data = isinstance(obj_dml_data, DoubleMLClusterData)
+        self._check_data(self._dml_data)
+
+        self._treatment_levels = np.asarray(treatment_levels).reshape((-1, ))
+        self._check_treatment_levels()
+        self._n_levels = len(self._treatment_levels)
+
+        self._normalize_ipw = normalize_ipw
+        self._n_folds = n_folds
+        self._n_rep = n_rep
+        # store the weights; they are passed on to the level-wise models
+        self._weights = weights
+
+        # check score
+        self._score = score
+        valid_scores = ['APO']
+        _check_score(self.score, valid_scores, allow_callable=False)
+
+        # initialize framework which is constructed after the fit method is called
+        self._framework = None
+
+        # initialize and check trimming
+        self._trimming_rule = trimming_rule
+        self._trimming_threshold = trimming_threshold
+        _check_trimming(self._trimming_rule, self._trimming_threshold)
+
+        if not isinstance(self.normalize_ipw, bool):
+            raise TypeError('Normalization indicator has to be boolean. ' +
+                            f'Object of type {str(type(self.normalize_ipw))} passed.')
+
+        # perform sample splitting
+        self._smpls = None
+        if draw_sample_splitting:
+            self.draw_sample_splitting()
+
+        self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
+        self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'}
+
+        # initialize all models
+        self._modellist = self._initialize_models()
+
+    @property
+    def score(self):
+        """
+        The score function.
+        """
+        return self._score
+
+    @property
+    def n_levels(self):
+        """
+        The number of treatment levels.
+        """
+        return self._n_levels
+
+    @property
+    def normalize_ipw(self):
+        """
+        Indicates whether the inverse probability weights are normalized.
+        """
+        return self._normalize_ipw
+
+    @property
+    def trimming_rule(self):
+        """
+        Specifies the used trimming rule.
+        """
+        return self._trimming_rule
+
+    @property
+    def trimming_threshold(self):
+        """
+        Specifies the used trimming threshold.
+        """
+        return self._trimming_threshold
+
+    @property
+    def weights(self):
+        """
+        Specifies the weights for a weighted average potential outcome.
+        """
+        return self._weights
+
+    @property
+    def n_folds(self):
+        """
+        Number of folds.
+        """
+        return self._n_folds
+
+    @property
+    def n_rep(self):
+        """
+        Number of repetitions for the sample splitting.
+        """
+        return self._n_rep
+
+    @property
+    def coef(self):
+        """
+        Estimates for the causal parameter(s) after calling :meth:`fit` (shape (``n_levels``,)).
+        """
+        if self._framework is None:
+            coef = None
+        else:
+            coef = self.framework.thetas
+        return coef
+
+    @property
+    def framework(self):
+        """
+        The corresponding :class:`doubleml.DoubleMLFramework` object.
+        """
+        return self._framework
+
+    @property
+    def modellist(self):
+        """
+        The list of models for each level.
+        """
+        return self._modellist
+
+    def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions=None):
+        """
+        Estimate DoubleMLAPOS models.
+
+        Parameters
+        ----------
+        n_jobs_models : None or int
+            The number of CPUs to use to fit the treatment levels. ``None`` means ``1``.
+            Default is ``None``.
+
+        n_jobs_cv : None or int
+            The number of CPUs to use to fit the learners. ``None`` means ``1``.
+            Does not speed up computation for a single treatment level.
+            Default is ``None``.
+
+        store_predictions : bool
+            Indicates whether the predictions for the nuisance functions should be stored in ``predictions``.
+            Default is ``True``.
+
+        store_models : bool
+            Indicates whether the fitted models for the nuisance functions should be stored in ``models``. This allows
+            to analyze the fitted models or extract information like variable importance.
+            Default is ``False``.
+
+        Returns
+        -------
+        self : object
+        """
+
+        if external_predictions is not None:
+            raise NotImplementedError(f"External predictions not implemented for {self.__class__.__name__}.")
+
+        # parallel estimation of the models for the treatment levels
+        parallel = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch='2*n_jobs')
+        fitted_models = parallel(delayed(self._fit_model)(i_level, n_jobs_cv, store_predictions, store_models)
+                                 for i_level in range(self.n_levels))
+
+        # combine the estimates and scores
+        framework_list = [None] * self.n_levels
+
+        for i_level in range(self.n_levels):
+            self._modellist[i_level] = fitted_models[i_level]
+            framework_list[i_level] = self._modellist[i_level].framework
+
+        # aggregate all frameworks
+        self._framework = concat(framework_list)
+
+        return self
+
+    def confint(self, joint=False, level=0.95):
+        """
+        Confidence intervals for DoubleML models.
+
+        Parameters
+        ----------
+        joint : bool
+            Indicates whether joint confidence intervals are computed.
+            Default is ``False``.
+
+        level : float
+            The confidence level.
+            Default is ``0.95``.
+
+        Returns
+        -------
+        df_ci : pd.DataFrame
+            A data frame with the confidence interval(s).
+        """
+
+        if self.framework is None:
+            raise ValueError('Apply fit() before confint().')
+
+        df_ci = self.framework.confint(joint=joint, level=level)
+        df_ci.set_index(pd.Index(self._treatment_levels), inplace=True)
+
+        return df_ci
+
+    def draw_sample_splitting(self):
+        """
+        Draw sample splitting for DoubleML models.
+
+        The samples are drawn according to the attributes
+        ``n_folds`` and ``n_rep``.
+ + Returns + ------- + self : object + """ + obj_dml_resampling = DoubleMLResampling(n_folds=self.n_folds, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._dml_data.d) + self._smpls = obj_dml_resampling.split_samples() + + return self + + def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False): + + model = self.modellist_0[i_level] + model.fit(n_jobs_cv=n_jobs_cv, store_predictions=store_predictions, store_models=store_models) + return model + + def _check_treatment_levels(self): + if not np.all(np.isin(self._treatment_levels, np.unique(self._dml_data.d))): + raise ValueError('The treatment levels have to be a subset of the unique treatment levels in the data.') + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLData): + raise TypeError('The data must be of DoubleMLData or DoubleMLClusterData type.') + if obj_dml_data.z is not None: + raise ValueError('The data must not contain instrumental variables.') + return + + def _initialize_models(self): + modellist = [None] * self.n_levels + kwargs = { + 'obj_dml_data': self._dml_data, + 'ml_g': self._learner['ml_g'], + 'ml_m': self._learner['ml_m'], + 'score': self.score, + 'n_folds': self.n_folds, + 'n_rep': self.n_rep, + 'weights': self.weights, + 'trimming_rule': self.trimming_rule, + 'trimming_threshold': self.trimming_threshold, + 'normalize_ipw': self.normalize_ipw, + 'draw_sample_splitting': False + } + for i_level in range(self.n_levels): + # initialize models for all levels + model = DoubleMLAPO( + treatment_level=self._treatment_levels[i_level], + **kwargs + ) + + # synchronize the sample splitting + model.set_sample_splitting(all_smpls=self.smpls) + modellist[i_level] = model + + return modellist diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py new file mode 100644 index 000000000..9081a4e4a --- /dev/null +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -0,0 +1,72 @@ +import pytest +import pandas as pd +import numpy as np + +from doubleml import DoubleMLAPOS, DoubleMLData +from doubleml.datasets import make_irm_data_discrete_treatments, make_iivm_data + +from sklearn.linear_model import Lasso, LogisticRegression + +n = 100 +data = make_irm_data_discrete_treatments(n_obs=n) +df = pd.DataFrame( + np.column_stack((data['y'], data['d'], data['x'])), + columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])] +) + +dml_data = DoubleMLData(df, 'y', 'd') + +ml_g = Lasso() +ml_m = LogisticRegression() + + +@pytest.mark.ci +def test_apos_exception_data(): + msg = 'The data must be of DoubleMLData or DoubleMLClusterData type.' + with pytest.raises(TypeError, match=msg): + _ = DoubleMLAPOS(pd.DataFrame(), ml_g, ml_m, treatment_levels=0) + + msg = 'The data must not contain instrumental variables.' + with pytest.raises(ValueError, match=msg): + dml_data_z = make_iivm_data() + _ = DoubleMLAPOS(dml_data_z, ml_g, ml_m, treatment_levels=0) + + msg = 'The treatment levels have to be a subset of the unique treatment levels in the data.' + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[1.1]) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=1.1) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[1, 2.2]) + + +@pytest.mark.ci +def test_apos_exception_scores(): + msg = 'Invalid score MAR. Valid score APO.' 
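+    # For contrast, a hypothetical valid call (using the module-level data and
+    # learners defined above) would use the only supported score:
+    # _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, score='APO')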
+ with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, score='MAR') + + +@pytest.mark.ci +def test_apos_exception_trimming_rule(): + msg = 'Invalid trimming_rule discard. Valid trimming_rule truncate.' + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, trimming_rule='discard') + + # check the trimming_threshold exceptions + msg = "trimming_threshold has to be a float. Object of type passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, + trimming_rule='truncate', trimming_threshold="0.1") + + msg = 'Invalid trimming_threshold 0.6. trimming_threshold has to be between 0 and 0.5.' + with pytest.raises(ValueError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, + trimming_rule='truncate', trimming_threshold=0.6) + + +@pytest.mark.ci +def test_apos_exception_ipw_normalization(): + msg = "Normalization indicator has to be boolean. Object of type passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, normalize_ipw=1) From f849bd034fae2e0817e5369986c23dcf27639794 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Mon, 15 Jul 2024 20:31:24 +0200 Subject: [PATCH 30/98] update irm dgp --- doubleml/datasets.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index fd18affdc..b3f6d745d 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1435,7 +1435,7 @@ def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleM raise ValueError('Invalid return_type.') -def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=None, **kwargs): +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): """ Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an underlying continous treatment). 
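     The discrete treatment is derived from an underlying continuous treatment:
     the continuous draws are cut into ``n_levels`` bins at their empirical
     quantiles, and each unit is untreated (``d = 0``) with baseline probability
     ``1/n_levels``; otherwise it is assigned its bin level (see the hunks below).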
@@ -1536,10 +1536,10 @@ def f_treatment(w, xi): res = xi * (-w[:, 0] + 0.5*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3]) return res - def treatment_effect(d): - return 0.1 * np.exp(d) + 10 * np.sin(0.7 * d) + 2 * d - 0.2 * np.square(d) + def treatment_effect(d, scale=5): + return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 - z_tilde_1 = np.exp(0.5*x[:, 0]) + z_tilde_1 = np.exp(0.5 * x[:, 0]) z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3 z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2 @@ -1553,16 +1553,24 @@ def treatment_effect(d): var_eps_d = 1 eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - cont_d = f_treatment(z, xi) + eps_d + if linear: + g = f_reg(x) + m = f_treatment(x, xi) + else: + assert not linear + g = f_reg(z) + m = f_treatment(z, xi) + + cont_d = m + eps_d level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 eta = np.random.uniform(0, 1, size=n_obs) - observed_d = 1.0 * (eta >= 1/n_levels) * potential_level + d = 1.0 * (eta >= 1/n_levels) * potential_level ite = treatment_effect(cont_d) - y0 = f_reg(z) + eps_y + y0 = g + eps_y # only treated for d > 0 compared to the baseline - y = ite * (observed_d > 0) + y0 + y = ite * (d > 0) + y0 oracle_values = { 'cont_d': cont_d, @@ -1575,7 +1583,7 @@ def treatment_effect(d): resul_dict = { 'x': x, 'y': y, - 'd': observed_d, + 'd': d, 'oracle_values': oracle_values } From ec246689c1ba965ab0b09ba849a368c36d3e1025 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Mon, 15 Jul 2024 20:31:28 +0200 Subject: [PATCH 31/98] Update apo.py --- doubleml/irm/apo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index 423f3fa7e..838e55460 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -133,7 +133,7 @@ def __init__(self, self._sensitivity_implemented = True self._external_predictions_implemented = True - # ATE weights are the standard case + # APO weights _check_weights(weights, score="ATE", n_obs=obj_dml_data.n_obs, n_rep=self.n_rep) self._initialize_weights(weights) From 58f2b3962779cc0ff875fe5c4faf4b0b7b057e9e Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Mon, 15 Jul 2024 20:31:32 +0200 Subject: [PATCH 32/98] Update apos.py --- doubleml/irm/apos.py | 46 +++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 24768bf9d..fb4b7aeeb 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -10,7 +10,7 @@ from ..double_ml_framework import concat from ..utils.resampling import DoubleMLResampling -from ..utils._checks import _check_score, _check_trimming +from ..utils._checks import _check_score, _check_trimming, _check_weights class DoubleMLAPOS: @@ -22,7 +22,7 @@ def __init__(self, ml_m, treatment_levels, n_folds=5, - n_rep=1, + n_rep=1, score='APO', weights=None, normalize_ipw=False, @@ -36,7 +36,7 @@ def __init__(self, self._treatment_levels = np.asarray(treatment_levels).reshape((-1, )) self._check_treatment_levels() - self._n_levels = len(self._treatment_levels) + self._n_treatment_levels = len(self._treatment_levels) self._normalize_ipw = normalize_ipw self._n_folds = n_folds @@ -67,6 +67,10 @@ def __init__(self, self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)} self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'} + # APO weights + _check_weights(weights, score="ATE", 
n_obs=obj_dml_data.n_obs, n_rep=self.n_rep) + self._initialize_weights(weights) + # initialize all models self._modellist = self._initialize_models() @@ -78,11 +82,11 @@ def score(self): return self._score @property - def n_levels(self): + def n_treatment_levels(self): """ The number of treatment levels. """ - return self._n_levels + return self._n_treatment_levels @property def normalize_ipw(self): @@ -137,6 +141,17 @@ def coef(self): coef = self.framework.thetas return coef + @property + def smpls(self): + """ + The partition used for cross-fitting. + """ + if self._smpls is None: + err_msg = ('Sample splitting not specified. Draw samples via .draw_sample splitting(). ' + + 'External samples not implemented yet.') + raise ValueError(err_msg) + return self._smpls + @property def framework(self): """ @@ -189,10 +204,10 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ for i_level in range(self.n_treatment_levels)) # combine the estimates and scores - framework_list = [None] * self.n_levels + framework_list = [None] * self.n_treatment_levels - for i_level in range(self.n_levels): - self._modellist[i_level] = fitted_models[i_level][0] + for i_level in range(self.n_treatment_levels): + self._modellist[i_level] = fitted_models[i_level] framework_list[i_level] = self._modellist[i_level].framework # aggregate all frameworks @@ -249,7 +264,7 @@ def draw_sample_splitting(self): def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False): - model = self.modellist_0[i_level] + model = self.modellist[i_level] model.fit(n_jobs_cv=n_jobs_cv, store_predictions=store_predictions, store_models=store_models) return model @@ -264,8 +279,17 @@ def _check_data(self, obj_dml_data): raise ValueError('The data must not contain instrumental variables.') return + def _initialize_weights(self, weights): + if weights is None: + weights = np.ones(self._dml_data.n_obs) + if isinstance(weights, np.ndarray): + self._weights = weights + else: + assert isinstance(weights, dict) + self._weights = weights + def _initialize_models(self): - modellist = [None] * self.n_levels + modellist = [None] * self.n_treatment_levels kwargs = { 'obj_dml_data': self._dml_data, 'ml_g': self._learner['ml_g'], @@ -279,7 +303,7 @@ def _initialize_models(self): 'normalize_ipw': self.normalize_ipw, 'draw_sample_splitting': False } - for i_level in range(self.n_levels): + for i_level in range(self.n_treatment_levels): # initialize models for all levels model = DoubleMLAPO( treatment_level=self._treatment_levels[i_level], From 9ab05aa8ea92df2ff1d65c4bff532606f8431336 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Wed, 17 Jul 2024 20:04:31 +0200 Subject: [PATCH 33/98] update set sample splitting documentation --- doubleml/double_ml.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 46e18825f..a48dbbc13 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1188,9 +1188,6 @@ def set_sample_splitting(self, all_smpls): >>> ml_m = learner >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> # simple sample splitting with two folds and without cross-fitting - >>> smpls = ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]) - >>> dml_plr_obj.set_sample_splitting(smpls) >>> # sample splitting with two folds and cross-fitting >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] From b3f4f77d86b8fcc28f1e02b496ad0517c069f7d1 Mon Sep 17 
00:00:00 2001 From: Sven1704 Date: Wed, 17 Jul 2024 20:47:15 +0200 Subject: [PATCH 34/98] add set_sample_slit to apos.py --- doubleml/irm/apos.py | 164 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 160 insertions(+), 4 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index fb4b7aeeb..d291423dc 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -10,7 +10,8 @@ from ..double_ml_framework import concat from ..utils.resampling import DoubleMLResampling -from ..utils._checks import _check_score, _check_trimming, _check_weights +from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_is_partition, \ + _check_smpl_split_tpl, _check_smpl_split, _check_all_smpls class DoubleMLAPOS: @@ -64,6 +65,9 @@ def __init__(self, if draw_sample_splitting: self.draw_sample_splitting() + # initialize all models if splits are known + self._modellist = self._initialize_models() + self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)} self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'} @@ -71,8 +75,6 @@ def __init__(self, _check_weights(weights, score="ATE", n_obs=obj_dml_data.n_obs, n_rep=self.n_rep) self._initialize_weights(weights) - # initialize all models - self._modellist = self._initialize_models() @property def score(self): @@ -133,7 +135,7 @@ def n_rep(self): @property def coef(self): """ - Estimates for the causal parameter(s) after calling :meth:`fit` (shape (``n_quantiles``,)). + Estimates for the causal parameter(s) after calling :meth:`fit` (shape (``n_treatment_levels``,)). """ if self._framework is None: coef = None @@ -141,6 +143,41 @@ def coef(self): coef = self.framework.thetas return coef + @property + def all_coef(self): + """ + Estimates of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit` + (shape (``n_treatment_levels``, ``n_rep``)). + """ + if self._framework is None: + all_coef = None + else: + all_coef = self.framework.all_thetas + return all_coef + + @property + def se(self): + """ + Standard errors for the causal parameter(s) after calling :meth:`fit` (shape (``n_treatment_levels``,)). + """ + if self._framework is None: + se = None + else: + se = self.framework.ses + return se + + @property + def all_se(self): + """ + Standard errors of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit` + (shape (``n_treatment_levels``, ``n_rep``)). + """ + if self._framework is None: + all_se = None + else: + all_se = self.framework.all_ses + return all_se + @property def smpls(self): """ @@ -262,6 +299,125 @@ def draw_sample_splitting(self): return self + def set_sample_splitting(self, all_smpls): + """ + Set the sample splitting for DoubleML models. + + The attributes ``n_folds`` and ``n_rep`` are derived from the provided partition. + + Parameters + ---------- + all_smpls : list or tuple + If nested list of lists of tuples: + The outer list needs to provide an entry per repeated sample splitting (length of list is set as + ``n_rep``). + The inner list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as + ``n_folds``). test_ind must form a partition for each inner list. + If list of tuples: + The list needs to provide a tuple (train_ind, test_ind) per fold (length of list is set as + ``n_folds``). test_ind must form a partition. ``n_rep=1`` is always set. + If tuple: + Must be a tuple with two elements train_ind and test_ind. 
Only viable option is to set + train_ind and test_ind to np.arange(n_obs), which corresponds to no sample splitting. + ``n_folds=1`` and ``n_rep=1`` is always set. + + Returns + ------- + self : object + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.base import clone + >>> np.random.seed(3141) + >>> learner = RandomForestRegressor(max_depth=2, n_estimators=10) + >>> ml_g = learner + >>> ml_m = learner + >>> obj_dml_data = make_plr_CCDDHNR2018(n_obs=10, alpha=0.5) + >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) + >>> # sample splitting with two folds and cross-fitting + >>> smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), + >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] + >>> dml_plr_obj.set_sample_splitting(smpls) + >>> # sample splitting with two folds and repeated cross-fitting with n_rep = 2 + >>> smpls = [[([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), + >>> ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], + >>> [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), + >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] + >>> dml_plr_obj.set_sample_splitting(smpls) + """ + if self._is_cluster_data: + raise NotImplementedError('Externally setting the sample splitting for DoubleML is ' + 'not yet implemented with clustering.') + if isinstance(all_smpls, tuple): + if not len(all_smpls) == 2: + raise ValueError('Invalid partition provided. ' + 'Tuple for train_ind and test_ind must consist of exactly two elements.') + all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs) + if (_check_is_partition([all_smpls], self._dml_data.n_obs) & + _check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)): + self._n_rep = 1 + self._n_folds = 1 + self._smpls = [[all_smpls]] + else: + raise ValueError('Invalid partition provided. ' + 'Tuple provided that doesn\'t form a partition.') + else: + if not isinstance(all_smpls, list): + raise TypeError('all_smpls must be of list or tuple type. ' + f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.') + all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls]) + if all_tuple: + if not all([len(tpl) == 2 for tpl in all_smpls]): + raise ValueError('Invalid partition provided. ' + 'All tuples for train_ind and test_ind must consist of exactly two elements.') + self._n_rep = 1 + all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs) + if _check_is_partition(all_smpls, self._dml_data.n_obs): + if ((len(all_smpls) == 1) & + _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)): + self._n_folds = 1 + self._smpls = [all_smpls] + else: + self._n_folds = len(all_smpls) + self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True) + else: + raise ValueError('Invalid partition provided. ' + 'Tuples provided that don\'t form a partition.') + else: + all_list = all([isinstance(smpl, list) for smpl in all_smpls]) + if not all_list: + raise ValueError('Invalid partition provided. ' + 'all_smpls is a list where neither all elements are tuples ' + 'nor all elements are lists.') + all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls]) + if not all_tuple: + raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.') + all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls]) + if not all_pairs: + raise ValueError('Invalid partition provided. 
' + 'All tuples for train_ind and test_ind must consist of exactly two elements.') + n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls]) + if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]): + raise ValueError('Invalid partition provided. ' + 'Different number of folds for repeated sample splitting.') + all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs) + smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls] + + if all(smpls_are_partitions): + self._n_rep = len(all_smpls) + self._n_folds = int(n_folds_each_smpl[0]) + self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True) + else: + raise ValueError('Invalid partition provided. ' + 'At least one inner list does not form a partition.') + self._modellist = self._initialize_models() + + return self + def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False): model = self.modellist[i_level] From d2ab51244fc07006c5bbb81547e5049621e8f251 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Wed, 17 Jul 2024 20:47:29 +0200 Subject: [PATCH 35/98] create manual apos version and basic unit test --- doubleml/irm/tests/_utils_apos_manual.py | 59 ++++++++++++ doubleml/irm/tests/test_apos.py | 113 +++++++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 doubleml/irm/tests/_utils_apos_manual.py create mode 100644 doubleml/irm/tests/test_apos.py diff --git a/doubleml/irm/tests/_utils_apos_manual.py b/doubleml/irm/tests/_utils_apos_manual.py new file mode 100644 index 000000000..9356ec815 --- /dev/null +++ b/doubleml/irm/tests/_utils_apos_manual.py @@ -0,0 +1,59 @@ +import numpy as np +from sklearn.base import clone + +from ..apo import DoubleMLAPO +from ...double_ml_data import DoubleMLData + + +def fit_apos(y, x, d, + learner_g, learner_m, treatment_levels, all_smpls, score, + n_rep=1, trimming_rule='truncate', + normalize_ipw=False, trimming_threshold=1e-2): + n_obs = len(y) + n_treatments = len(treatment_levels) + n_folds = len(all_smpls[0]) + + dml_data = DoubleMLData.from_arrays(x, y, d) + + all_apos = np.zeros((n_treatments, n_rep)) + all_se = np.zeros((n_treatments, n_rep)) + apo_scaled_score = np.zeros((n_obs, n_treatments, n_rep)) + + for i_level in range(n_treatments): + model_APO = DoubleMLAPO( + dml_data, + clone(learner_g), + clone(learner_m), + treatment_level=treatment_levels[i_level], + n_folds=n_folds, + n_rep=n_rep, + score=score, + trimming_rule=trimming_rule, + trimming_threshold=trimming_threshold, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False + ) + + # synchronize the sample splitting + model_APO.set_sample_splitting(all_smpls) + model_APO.fit() + + all_apos[i_level, :] = model_APO.all_coef + all_se[i_level, :] = model_APO.all_se + + for i_rep in range(n_rep): + J = model_APO.psi_deriv[:, i_rep, 0].mean() + apo_psi = model_APO.psi[:, i_rep, 0] + + apo_scaled_score[:, i_level, i_rep] = apo_psi / J + + apos = np.median(all_apos, axis=1) + se = np.zeros(n_treatments) + for i_level in range(n_treatments): + se[i_level] = np.sqrt(np.median(np.power(all_se[i_level, :], 2) * n_obs + + np.power(all_apos[i_level, :] - all_apos[i_level], 2)) / n_obs) + + res = {'apos': apos, 'se': se, + 'all_apos': all_apos, 'all_se': all_se, + 'apo_scaled_score': apo_scaled_score} + return res diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py new file mode 100644 index 000000000..1881b39da --- /dev/null +++ b/doubleml/irm/tests/test_apos.py @@ -0,0 +1,113 @@ +import numpy as 
np +import pandas as pd +import pytest + +from sklearn.base import clone + +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +import doubleml as dml +from doubleml.datasets import make_irm_data_discrete_treatments + +from ...tests._utils import draw_smpls +from ._utils_apos_manual import fit_apos + + +@pytest.fixture(scope='module', + params=[[LinearRegression(), + LogisticRegression(solver='lbfgs', max_iter=250, random_state=42)], + [RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)]]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=[1]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope='module', + params=[False, True]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0.2, 0.15]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope='module', + params=[[0, 1, 2], [0]]) +def treatment_levels(request): + return request.param + + +@pytest.fixture(scope='module') +def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_threshold, treatment_levels): + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 499 + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + np.random.seed(3141) + n_obs = 500 + data = make_irm_data_discrete_treatments(n_obs=n_obs) + y = data['y'] + x = data['x'] + d = data['d'] + df = pd.DataFrame( + np.column_stack((y, d, x)), + columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])] + ) + + dml_data = dml.DoubleMLData(df, 'y', 'd') + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) + + np.random.seed(3141) + dml_obj = dml.DoubleMLAPOS( + dml_data, + ml_g, ml_m, + treatment_levels=treatment_levels, + n_folds=n_folds, + n_rep=n_rep, + score='APO', + normalize_ipw=normalize_ipw, + trimming_rule='truncate', + trimming_threshold=trimming_threshold, + draw_sample_splitting=False) + + # synchronize the sample splitting + dml_obj.set_sample_splitting(all_smpls) + dml_obj.fit() + + np.random.seed(3141) + res_manual = fit_apos( + y, x, d, + clone(learner[0]), clone(learner[1]), + treatment_levels=treatment_levels, + all_smpls=all_smpls, + score='APO', + trimming_rule='truncate', + normalize_ipw=normalize_ipw, + trimming_threshold=trimming_threshold) + + res_dict = {'coef': dml_obj.coef, + 'coef_manual': res_manual['apos'], + 'se': dml_obj.se, + 'se_manual': res_manual['se']} + return res_dict + + +@pytest.mark.ci +def test_dml_apos_coef(dml_apos_fixture): + assert np.allclose(dml_apos_fixture['coef'], + dml_apos_fixture['coef_manual'], + rtol=1e-9, atol=1e-9) From e6d680cde78deb304d18c1e85c0c3453c3608591 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Wed, 17 Jul 2024 20:52:03 +0200 Subject: [PATCH 36/98] Update _utils_apos_manual.py --- doubleml/irm/tests/_utils_apos_manual.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doubleml/irm/tests/_utils_apos_manual.py b/doubleml/irm/tests/_utils_apos_manual.py index 9356ec815..db5f41c55 100644 --- a/doubleml/irm/tests/_utils_apos_manual.py +++ b/doubleml/irm/tests/_utils_apos_manual.py @@ -47,11 +47,11 @@ def fit_apos(y, x, d, apo_scaled_score[:, i_level, i_rep] = apo_psi / J - apos = np.median(all_apos, axis=1) - se = np.zeros(n_treatments) - for i_level in range(n_treatments): - 
se[i_level] = np.sqrt(np.median(np.power(all_se[i_level, :], 2) * n_obs + - np.power(all_apos[i_level, :] - all_apos[i_level], 2)) / n_obs) + apos = np.median(all_apos, axis=1) + se = np.zeros(n_treatments) + for i_level in range(n_treatments): + se[i_level] = np.sqrt(np.median(np.power(all_se[i_level, :], 2) * n_obs + + np.power(all_apos[i_level, :] - all_apos[i_level], 2)) / n_obs) res = {'apos': apos, 'se': se, 'all_apos': all_apos, 'all_se': all_se, From 2adc6d40157c662ce0f89bd641a71f43853e8bbe Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:42:11 +0200 Subject: [PATCH 37/98] update set_sample_splitting in apos.py --- doubleml/irm/apos.py | 91 ++++++--------------------------- doubleml/irm/tests/test_apos.py | 43 +++++++++------- 2 files changed, 40 insertions(+), 94 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index d291423dc..735fda63b 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -10,8 +10,7 @@ from ..double_ml_framework import concat from ..utils.resampling import DoubleMLResampling -from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_is_partition, \ - _check_smpl_split_tpl, _check_smpl_split, _check_all_smpls +from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_sample_splitting class DoubleMLAPOS: @@ -60,14 +59,6 @@ def __init__(self, raise TypeError('Normalization indicator has to be boolean. ' + f'Object of type {str(type(self.normalize_ipw))} passed.') - # perform sample splitting - self._smpls = None - if draw_sample_splitting: - self.draw_sample_splitting() - - # initialize all models if splits are known - self._modellist = self._initialize_models() - self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)} self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'} @@ -75,6 +66,13 @@ def __init__(self, _check_weights(weights, score="ATE", n_obs=obj_dml_data.n_obs, n_rep=self.n_rep) self._initialize_weights(weights) + # perform sample splitting + self._smpls = None + if draw_sample_splitting: + self.draw_sample_splitting() + + # initialize all models if splits are known + self._modellist = self._initialize_models() @property def score(self): @@ -227,6 +225,9 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ to analyze the fitted models or extract information like variable importance. Default is ``False``. + external_predictions : None + Not implemented for DoubleMLAPOS. + Returns ------- self : object @@ -299,7 +300,7 @@ def draw_sample_splitting(self): return self - def set_sample_splitting(self, all_smpls): + def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): """ Set the sample splitting for DoubleML models. @@ -349,71 +350,9 @@ def set_sample_splitting(self, all_smpls): >>> ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])]] >>> dml_plr_obj.set_sample_splitting(smpls) """ - if self._is_cluster_data: - raise NotImplementedError('Externally setting the sample splitting for DoubleML is ' - 'not yet implemented with clustering.') - if isinstance(all_smpls, tuple): - if not len(all_smpls) == 2: - raise ValueError('Invalid partition provided. 
' - 'Tuple for train_ind and test_ind must consist of exactly two elements.') - all_smpls = _check_smpl_split_tpl(all_smpls, self._dml_data.n_obs) - if (_check_is_partition([all_smpls], self._dml_data.n_obs) & - _check_is_partition([(all_smpls[1], all_smpls[0])], self._dml_data.n_obs)): - self._n_rep = 1 - self._n_folds = 1 - self._smpls = [[all_smpls]] - else: - raise ValueError('Invalid partition provided. ' - 'Tuple provided that doesn\'t form a partition.') - else: - if not isinstance(all_smpls, list): - raise TypeError('all_smpls must be of list or tuple type. ' - f'{str(all_smpls)} of type {str(type(all_smpls))} was passed.') - all_tuple = all([isinstance(tpl, tuple) for tpl in all_smpls]) - if all_tuple: - if not all([len(tpl) == 2 for tpl in all_smpls]): - raise ValueError('Invalid partition provided. ' - 'All tuples for train_ind and test_ind must consist of exactly two elements.') - self._n_rep = 1 - all_smpls = _check_smpl_split(all_smpls, self._dml_data.n_obs) - if _check_is_partition(all_smpls, self._dml_data.n_obs): - if ((len(all_smpls) == 1) & - _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], self._dml_data.n_obs)): - self._n_folds = 1 - self._smpls = [all_smpls] - else: - self._n_folds = len(all_smpls) - self._smpls = _check_all_smpls([all_smpls], self._dml_data.n_obs, check_intersect=True) - else: - raise ValueError('Invalid partition provided. ' - 'Tuples provided that don\'t form a partition.') - else: - all_list = all([isinstance(smpl, list) for smpl in all_smpls]) - if not all_list: - raise ValueError('Invalid partition provided. ' - 'all_smpls is a list where neither all elements are tuples ' - 'nor all elements are lists.') - all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in all_smpls]) - if not all_tuple: - raise TypeError('For repeated sample splitting all_smpls must be list of lists of tuples.') - all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in all_smpls]) - if not all_pairs: - raise ValueError('Invalid partition provided. ' - 'All tuples for train_ind and test_ind must consist of exactly two elements.') - n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls]) - if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]): - raise ValueError('Invalid partition provided. ' - 'Different number of folds for repeated sample splitting.') - all_smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs) - smpls_are_partitions = [_check_is_partition(smpl, self._dml_data.n_obs) for smpl in all_smpls] - - if all(smpls_are_partitions): - self._n_rep = len(all_smpls) - self._n_folds = int(n_folds_each_smpl[0]) - self._smpls = _check_all_smpls(all_smpls, self._dml_data.n_obs, check_intersect=True) - else: - raise ValueError('Invalid partition provided. 
' - 'At least one inner list does not form a partition.') + self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( + all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data) + self._modellist = self._initialize_models() return self diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 1881b39da..55d5252e0 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -10,7 +10,6 @@ import doubleml as dml from doubleml.datasets import make_irm_data_discrete_treatments -from ...tests._utils import draw_smpls from ._utils_apos_manual import fit_apos @@ -69,26 +68,29 @@ def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_ ) dml_data = dml.DoubleMLData(df, 'y', 'd') - all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) - np.random.seed(3141) - dml_obj = dml.DoubleMLAPOS( - dml_data, - ml_g, ml_m, - treatment_levels=treatment_levels, - n_folds=n_folds, - n_rep=n_rep, - score='APO', - normalize_ipw=normalize_ipw, - trimming_rule='truncate', - trimming_threshold=trimming_threshold, - draw_sample_splitting=False) - - # synchronize the sample splitting - dml_obj.set_sample_splitting(all_smpls) + input_args = { + "treatment_levels": treatment_levels, + "n_folds": n_folds, + "n_rep": n_rep, + "score": 'APO', + "normalize_ipw": normalize_ipw, + "trimming_rule": 'truncate', + "trimming_threshold": trimming_threshold, + } + + np.random.seed(42) + dml_obj = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args) dml_obj.fit() + # get the sample splitting + all_smpls = dml_obj.smpls - np.random.seed(3141) + np.random.seed(42) + dml_obj_ext_smpls = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args, draw_sample_splitting=False) + dml_obj_ext_smpls.set_sample_splitting(dml_obj.smpls) + dml_obj_ext_smpls.fit() + + np.random.seed(42) res_manual = fit_apos( y, x, d, clone(learner[0]), clone(learner[1]), @@ -100,8 +102,10 @@ def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_ trimming_threshold=trimming_threshold) res_dict = {'coef': dml_obj.coef, + 'coef_ext_smpls': dml_obj_ext_smpls.coef, 'coef_manual': res_manual['apos'], 'se': dml_obj.se, + 'se_ext_smpls': dml_obj_ext_smpls.se, 'se_manual': res_manual['se']} return res_dict @@ -111,3 +115,6 @@ def test_dml_apos_coef(dml_apos_fixture): assert np.allclose(dml_apos_fixture['coef'], dml_apos_fixture['coef_manual'], rtol=1e-9, atol=1e-9) + assert np.allclose(dml_apos_fixture['coef'], + dml_apos_fixture['coef_ext_smpls'], + rtol=1e-9, atol=1e-9) From 29f67c4297067659eee3ee58cf230af7dfbd978e Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:54:27 +0200 Subject: [PATCH 38/98] create manual confint version for qte and apos --- doubleml/irm/tests/_utils_qte_manual.py | 20 -------------------- doubleml/irm/tests/test_qte.py | 12 ++++++------ doubleml/tests/_utils.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/doubleml/irm/tests/_utils_qte_manual.py b/doubleml/irm/tests/_utils_qte_manual.py index ccaf96ee0..5c177907c 100644 --- a/doubleml/irm/tests/_utils_qte_manual.py +++ b/doubleml/irm/tests/_utils_qte_manual.py @@ -1,7 +1,5 @@ import numpy as np from sklearn.base import clone -import pandas as pd -from scipy.stats import norm from ..pq import DoubleMLPQ from ...double_ml_data import DoubleMLData @@ -99,21 +97,3 @@ def boot_qte(scaled_scores, ses, quantiles, all_smpls, n_rep, bootstrap, n_rep_b 
(n_obs * ses[i_quant, i_rep]) return boot_t_stat - - -def confint_qte(coef, se, quantiles, boot_t_stat=None, joint=True, level=0.95): - a = (1 - level) - ab = np.array([a / 2, 1. - a / 2]) - if joint: - assert boot_t_stat.shape[2] == 1 - sim = np.amax(np.abs(boot_t_stat[:, :, 0]), 1) - hatc = np.quantile(sim, 1 - a) - ci = np.vstack((coef - se * hatc, coef + se * hatc)).T - else: - fac = norm.ppf(ab) - ci = np.vstack((coef + se * fac[0], coef + se * fac[1])).T - - df_ci = pd.DataFrame(ci, - columns=['{:.1f} %'.format(i * 100) for i in ab], - index=quantiles) - return df_ci diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py index bdcd695da..636a59fac 100644 --- a/doubleml/irm/tests/test_qte.py +++ b/doubleml/irm/tests/test_qte.py @@ -9,8 +9,8 @@ from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier -from ...tests._utils import draw_smpls -from ._utils_qte_manual import fit_qte, boot_qte, confint_qte +from ...tests._utils import draw_smpls, confint_manual +from ._utils_qte_manual import fit_qte, boot_qte from doubleml.datasets import make_irm_data from ...utils._estimation import _default_kde @@ -94,8 +94,8 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde): draw_sample_splitting=True) ci = dml_qte_obj.confint(joint=False, level=0.95) - ci_manual = confint_qte(res_manual['qte'], res_manual['se'], quantiles, - boot_t_stat=None, joint=False, level=0.95) + ci_manual = confint_manual(res_manual['qte'], res_manual['se'], quantiles, + boot_t_stat=None, joint=False, level=0.95) res_dict = {'coef': dml_qte_obj.coef, 'coef_manual': res_manual['qte'], 'coef_ext_smpls': dml_qte_obj_ext_smpls.coef, @@ -120,8 +120,8 @@ def dml_qte_fixture(generate_data_quantiles, learner, normalize_ipw, kde): res_dict['boot_t_stat_' + bootstrap + '_manual'] = boot_t_stat ci = dml_qte_obj.confint(joint=True, level=0.95) - ci_manual = confint_qte(res_manual['qte'], res_manual['se'], quantiles, - boot_t_stat=boot_t_stat, joint=True, level=0.95) + ci_manual = confint_manual(res_manual['qte'], res_manual['se'], quantiles, + boot_t_stat=boot_t_stat, joint=True, level=0.95) res_dict['boot_ci_' + bootstrap] = ci.to_numpy() res_dict['boot_ci_' + bootstrap + '_manual'] = ci_manual.to_numpy() return res_dict diff --git a/doubleml/tests/_utils.py b/doubleml/tests/_utils.py index b6c8fbc28..fb85b2410 100644 --- a/doubleml/tests/_utils.py +++ b/doubleml/tests/_utils.py @@ -1,6 +1,8 @@ import numpy as np from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold from sklearn.base import clone +import pandas as pd +from scipy.stats import norm from ..utils._estimation import _var_est, _aggregate_coefs_and_ses @@ -111,3 +113,21 @@ def generate_dml_dict(psi_a, psi_b): } return doubleml_dict + + +def confint_manual(coef, se, index_names, boot_t_stat=None, joint=True, level=0.95): + a = (1 - level) + ab = np.array([a / 2, 1. 
- a / 2]) + if joint: + assert boot_t_stat.shape[2] == 1 + sim = np.amax(np.abs(boot_t_stat[:, :, 0]), 1) + hatc = np.quantile(sim, 1 - a) + ci = np.vstack((coef - se * hatc, coef + se * hatc)).T + else: + fac = norm.ppf(ab) + ci = np.vstack((coef + se * fac[0], coef + se * fac[1])).T + + df_ci = pd.DataFrame(ci, + columns=['{:.1f} %'.format(i * 100) for i in ab], + index=index_names) + return df_ci From d488dd571ce7c021732496c763f164e3be937010 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:09:41 +0200 Subject: [PATCH 39/98] add boostrap() to apos --- doubleml/irm/apos.py | 57 +++++++++++++++++++++ doubleml/irm/tests/_utils_apos_manual.py | 15 ++++++ doubleml/irm/tests/test_apos.py | 65 +++++++++++++++++++++--- 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 735fda63b..c4dbd787a 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -130,6 +130,28 @@ def n_rep(self): """ return self._n_rep + @property + def n_rep_boot(self): + """ + The number of bootstrap replications. + """ + if self._framework is None: + n_rep_boot = None + else: + n_rep_boot = self._framework.n_rep_boot + return n_rep_boot + + @property + def boot_method(self): + """ + The method to construct the bootstrap replications. + """ + if self._framework is None: + method = None + else: + method = self._framework.boot_method + return method + @property def coef(self): """ @@ -194,6 +216,18 @@ def framework(self): """ return self._framework + @property + def boot_t_stat(self): + """ + Bootstrapped t-statistics for the causal parameter(s) after calling :meth:`fit` and :meth:`bootstrap` + (shape (``n_rep_boot``, ``n_quantiles``, ``n_rep``)). + """ + if self._framework is None: + boot_t_stat = None + else: + boot_t_stat = self._framework.boot_t_stat + return boot_t_stat + @property def modellist(self): """ @@ -281,6 +315,29 @@ def confint(self, joint=False, level=0.95): return df_ci + def bootstrap(self, method='normal', n_rep_boot=500): + """ + Multiplier bootstrap for DoubleML models. + + Parameters + ---------- + method : str + A str (``'Bayes'``, ``'normal'`` or ``'wild'``) specifying the multiplier bootstrap method. + Default is ``'normal'`` + + n_rep_boot : int + The number of bootstrap replications. + + Returns + ------- + self : object + """ + if self._framework is None: + raise ValueError('Apply fit() before bootstrap().') + self._framework.bootstrap(method=method, n_rep_boot=n_rep_boot) + + return self + def draw_sample_splitting(self): """ Draw sample splitting for DoubleML models. 
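A minimal, hedged usage sketch of the bootstrap API added above (the seed,
sample size, learners and treatment levels are illustrative assumptions, not
part of the patch):

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression, LogisticRegression
    import doubleml as dml
    from doubleml.datasets import make_irm_data_discrete_treatments

    np.random.seed(42)
    data = make_irm_data_discrete_treatments(n_obs=500)
    df = pd.DataFrame(
        np.column_stack((data['y'], data['d'], data['x'])),
        columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])]
    )
    dml_data = dml.DoubleMLData(df, 'y', 'd')

    dml_obj = dml.DoubleMLAPOS(dml_data, LinearRegression(), LogisticRegression(),
                               treatment_levels=[0, 1, 2])
    dml_obj.fit()
    # multiplier bootstrap feeds the joint confidence intervals
    dml_obj.bootstrap(method='normal', n_rep_boot=499)
    print(dml_obj.confint(joint=True, level=0.95))

Since ``bootstrap()`` delegates to the aggregated framework object, the joint
intervals across all treatment levels are based on the bootstrap draws stored
there.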
diff --git a/doubleml/irm/tests/_utils_apos_manual.py b/doubleml/irm/tests/_utils_apos_manual.py index db5f41c55..cf47d6450 100644 --- a/doubleml/irm/tests/_utils_apos_manual.py +++ b/doubleml/irm/tests/_utils_apos_manual.py @@ -4,6 +4,8 @@ from ..apo import DoubleMLAPO from ...double_ml_data import DoubleMLData +from ...tests._utils_boot import draw_weights + def fit_apos(y, x, d, learner_g, learner_m, treatment_levels, all_smpls, score, @@ -57,3 +59,16 @@ def fit_apos(y, x, d, 'all_apos': all_apos, 'all_se': all_se, 'apo_scaled_score': apo_scaled_score} return res + + +def boot_apos(scaled_scores, ses, treatment_levels, all_smpls, n_rep, bootstrap, n_rep_boot): + n_treatment_levels = len(treatment_levels) + boot_t_stat = np.zeros((n_rep_boot, n_treatment_levels, n_rep)) + for i_rep in range(n_rep): + n_obs = scaled_scores.shape[0] + weights = draw_weights(bootstrap, n_rep_boot, n_obs) + for i_treatment_levels in range(n_treatment_levels): + boot_t_stat[:, i_treatment_levels, i_rep] = np.matmul(weights, scaled_scores[:, i_treatment_levels, i_rep]) / \ + (n_obs * ses[i_treatment_levels, i_rep]) + + return boot_t_stat diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 55d5252e0..6c39a9678 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -10,7 +10,8 @@ import doubleml as dml from doubleml.datasets import make_irm_data_discrete_treatments -from ._utils_apos_manual import fit_apos +from ._utils_apos_manual import fit_apos, boot_apos +from ...tests._utils import confint_manual @pytest.fixture(scope='module', @@ -101,12 +102,44 @@ def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_ normalize_ipw=normalize_ipw, trimming_threshold=trimming_threshold) - res_dict = {'coef': dml_obj.coef, - 'coef_ext_smpls': dml_obj_ext_smpls.coef, - 'coef_manual': res_manual['apos'], - 'se': dml_obj.se, - 'se_ext_smpls': dml_obj_ext_smpls.se, - 'se_manual': res_manual['se']} + ci = dml_obj.confint(joint=False, level=0.95) + ci_ext_smpls = dml_obj_ext_smpls.confint(joint=False, level=0.95) + ci_manual = confint_manual( + res_manual['apos'], res_manual['se'], treatment_levels, + boot_t_stat=None, joint=False, level=0.95 + ) + + res_dict = { + 'coef': dml_obj.coef, + 'coef_ext_smpls': dml_obj_ext_smpls.coef, + 'coef_manual': res_manual['apos'], + 'se': dml_obj.se, + 'se_ext_smpls': dml_obj_ext_smpls.se, + 'se_manual': res_manual['se'], + 'boot_methods': boot_methods, + 'ci': ci.to_numpy(), + 'ci_ext_smpls': ci_ext_smpls.to_numpy(), + 'ci_manual': ci_manual.to_numpy(), + 'apo_model': dml_obj + } + + for bootstrap in boot_methods: + np.random.seed(42) + boot_t_stat = boot_apos(res_manual['apo_scaled_score'], res_manual['all_se'], treatment_levels, + all_smpls, n_rep, bootstrap, n_rep_boot) + + np.random.seed(42) + dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict['boot_t_stat_' + bootstrap] = dml_obj.boot_t_stat + res_dict['boot_t_stat_' + bootstrap + '_manual'] = boot_t_stat + + ci = dml_obj.confint(joint=True, level=0.95) + ci_manual = confint_manual(res_manual['apos'], res_manual['se'], treatment_levels, + boot_t_stat=boot_t_stat, joint=True, level=0.95) + res_dict['boot_ci_' + bootstrap] = ci.to_numpy() + res_dict['boot_ci_' + bootstrap + '_manual'] = ci_manual.to_numpy() + return res_dict @@ -118,3 +151,21 @@ def test_dml_apos_coef(dml_apos_fixture): assert np.allclose(dml_apos_fixture['coef'], dml_apos_fixture['coef_ext_smpls'], rtol=1e-9, atol=1e-9) + + +@pytest.mark.ci +def 
test_dml_apos_se(dml_apos_fixture): + assert np.allclose(dml_apos_fixture['se'], + dml_apos_fixture['se_manual'], + rtol=1e-9, atol=1e-9) + assert np.allclose(dml_apos_fixture['se'], + dml_apos_fixture['se_ext_smpls'], + rtol=1e-9, atol=1e-9) + + +@pytest.mark.ci +def test_dml_apos_boot(dml_apos_fixture): + for bootstrap in dml_apos_fixture['boot_methods']: + assert np.allclose(dml_apos_fixture['boot_t_stat_' + bootstrap], + dml_apos_fixture['boot_t_stat_' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) From 3a415295758a6fb780128c5c81a8344fd6d6d615 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:25:27 +0200 Subject: [PATCH 40/98] add generate summary to utils --- doubleml/irm/qte.py | 13 +++++-------- doubleml/irm/tests/test_qte.py | 1 + doubleml/utils/_descriptive.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 doubleml/utils/_descriptive.py diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index bbfc7a411..2a212d77d 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -15,6 +15,8 @@ from ..utils.resampling import DoubleMLResampling from ..utils._checks import _check_score, _check_trimming, _check_zero_one_treatment, _check_sample_splitting +from ..utils._descriptive import generate_summary + class DoubleMLQTE: """Double machine learning for quantile treatment effects @@ -355,18 +357,13 @@ def summary(self): """ A summary for the estimated causal effect after calling :meth:`fit`. """ - col_names = ['coef', 'std err', 't', 'P>|t|'] if self.framework is None: + col_names = ['coef', 'std err', 't', 'P>|t|'] df_summary = pd.DataFrame(columns=col_names) else: - summary_stats = np.transpose(np.vstack( - [self.coef, self.se, - self.t_stat, self.pval])) - df_summary = pd.DataFrame(summary_stats, - columns=col_names, - index=self.quantiles) ci = self.confint() - df_summary = df_summary.join(ci) + df_summary = generate_summary(self.coef, self.se, self.t_stat, + self.pval, ci, self.quantiles) return df_summary def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions=None): diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py index 636a59fac..7c7b8c1df 100644 --- a/doubleml/irm/tests/test_qte.py +++ b/doubleml/irm/tests/test_qte.py @@ -181,6 +181,7 @@ def test_doubleml_qte_exceptions(): _ = dml_obj.smpls +@pytest.mark.ci def test_doubleml_qte_return_types(dml_qte_fixture): assert isinstance(dml_qte_fixture['qte_model'].__str__(), str) assert isinstance(dml_qte_fixture['qte_model'].summary, pd.DataFrame) diff --git a/doubleml/utils/_descriptive.py b/doubleml/utils/_descriptive.py new file mode 100644 index 000000000..79924a17e --- /dev/null +++ b/doubleml/utils/_descriptive.py @@ -0,0 +1,13 @@ +import numpy as np +import pandas as pd + + +def generate_summary(coef, se, t_stat, pval, ci, index_names): + col_names = ['coef', 'std err', 't', 'P>|t|'] + summary_stats = np.transpose(np.vstack( + [coef, se, t_stat, pval])) + df_summary = pd.DataFrame(summary_stats, + columns=col_names, + index=index_names) + df_summary = df_summary.join(ci) + return df_summary From 3bde8fd68cc62b3047923b4361fff099a0e2a6c5 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:32:57 +0200 Subject: [PATCH 41/98] add summary and properties to apos.py --- doubleml/irm/apos.py | 38 +++++++++++++++++++++++++++++++++ doubleml/irm/tests/test_apos.py | 32 
++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index c4dbd787a..bf86644b6 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -10,6 +10,7 @@ from ..double_ml_framework import concat from ..utils.resampling import DoubleMLResampling +from ..utils._descriptive import generate_summary from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_sample_splitting @@ -74,6 +75,14 @@ def __init__(self, # initialize all models if splits are known self._modellist = self._initialize_models() + def __str__(self): + class_name = self.__class__.__name__ + header = f'================== {class_name} Object ==================\n' + fit_summary = str(self.summary) + res = header + \ + '\n------------------ Fit summary ------------------\n' + fit_summary + return res + @property def score(self): """ @@ -198,6 +207,21 @@ def all_se(self): all_se = self.framework.all_ses return all_se + @property + def t_stat(self): + """ + t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_quantiles``,)). + """ + t_stat = self.coef / self.se + return t_stat + + @property + def pval(self): + """ + p-values for the causal parameter(s) (shape (``n_quantiles``,)). + """ + return self.framework.pvals + @property def smpls(self): """ @@ -235,6 +259,20 @@ def modellist(self): """ return self._modellist + @property + def summary(self): + """ + A summary for the estimated causal effect after calling :meth:`fit`. + """ + if self.framework is None: + col_names = ['coef', 'std err', 't', 'P>|t|'] + df_summary = pd.DataFrame(columns=col_names) + else: + ci = self.confint() + df_summary = generate_summary(self.coef, self.se, self.t_stat, + self.pval, ci, self._treatment_levels) + return df_summary + def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions=None): """ Estimate DoubleMLAPOS models. 
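A hedged sketch of the new reporting helpers (reusing the illustrative
``dml_obj`` from the sketch under PATCH 39):

    # __str__ renders a header followed by the fit summary table
    print(dml_obj)
    # summary is a DataFrame indexed by treatment level with columns
    # 'coef', 'std err', 't', 'P>|t|' plus the confint() bounds
    print(dml_obj.summary)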
diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 6c39a9678..23ba99c6c 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -80,6 +80,7 @@ def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_ "trimming_threshold": trimming_threshold, } + unfitted_apos_model = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args) np.random.seed(42) dml_obj = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args) dml_obj.fit() @@ -117,10 +118,13 @@ def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_ 'se_ext_smpls': dml_obj_ext_smpls.se, 'se_manual': res_manual['se'], 'boot_methods': boot_methods, + 'n_treatment_levels': len(treatment_levels), + 'n_rep': n_rep, 'ci': ci.to_numpy(), 'ci_ext_smpls': ci_ext_smpls.to_numpy(), 'ci_manual': ci_manual.to_numpy(), - 'apo_model': dml_obj + 'apos_model': dml_obj, + 'unfitted_apos_model': unfitted_apos_model } for bootstrap in boot_methods: @@ -169,3 +173,29 @@ def test_dml_apos_boot(dml_apos_fixture): assert np.allclose(dml_apos_fixture['boot_t_stat_' + bootstrap], dml_apos_fixture['boot_t_stat_' + bootstrap + '_manual'], rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_apos_ci(dml_apos_fixture): + for bootstrap in dml_apos_fixture['boot_methods']: + assert np.allclose(dml_apos_fixture['ci'], + dml_apos_fixture['ci_manual'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_apos_fixture['ci'], + dml_apos_fixture['ci_ext_smpls'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_apos_fixture['boot_ci_' + bootstrap], + dml_apos_fixture['boot_ci_' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_doubleml_apos_return_types(dml_apos_fixture): + assert isinstance(dml_apos_fixture['apos_model'].__str__(), str) + assert isinstance(dml_apos_fixture['apos_model'].summary, pd.DataFrame) + + assert dml_apos_fixture['apos_model'].all_coef.shape == ( + dml_apos_fixture['n_treatment_levels'], + dml_apos_fixture['n_rep'] + ) + assert isinstance(dml_apos_fixture['unfitted_apos_model'].summary, pd.DataFrame) From cb6ee7f96869499bbb990e6e255dfb4a6963421a Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:56:21 +0200 Subject: [PATCH 42/98] Update test_apos.py --- doubleml/irm/tests/test_apos.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 23ba99c6c..ff2a378f5 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -48,15 +48,11 @@ def treatment_levels(request): @pytest.fixture(scope='module') -def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_threshold, treatment_levels): +def dml_apos_fixture(learner, n_rep, normalize_ipw, trimming_threshold, treatment_levels): boot_methods = ['normal'] n_folds = 2 n_rep_boot = 499 - # Set machine learning methods for m & g - ml_g = clone(learner[0]) - ml_m = clone(learner[1]) - np.random.seed(3141) n_obs = 500 data = make_irm_data_discrete_treatments(n_obs=n_obs) @@ -71,24 +67,27 @@ def dml_apos_fixture(generate_data_irm, learner, n_rep, normalize_ipw, trimming_ dml_data = dml.DoubleMLData(df, 'y', 'd') input_args = { + 'obj_dml_data': dml_data, + 'ml_g': clone(learner[0]), + 'ml_m': clone(learner[1]), "treatment_levels": treatment_levels, "n_folds": n_folds, "n_rep": n_rep, "score": 'APO', "normalize_ipw": normalize_ipw, "trimming_rule": 'truncate', - 
"trimming_threshold": trimming_threshold, - } + "trimming_threshold": trimming_threshold, + } - unfitted_apos_model = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args) + unfitted_apos_model = dml.DoubleMLAPOS(**input_args) np.random.seed(42) - dml_obj = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args) + dml_obj = dml.DoubleMLAPOS(**input_args) dml_obj.fit() # get the sample splitting all_smpls = dml_obj.smpls np.random.seed(42) - dml_obj_ext_smpls = dml.DoubleMLAPOS(dml_data, ml_g, ml_m, **input_args, draw_sample_splitting=False) + dml_obj_ext_smpls = dml.DoubleMLAPOS(**input_args, draw_sample_splitting=False) dml_obj_ext_smpls.set_sample_splitting(dml_obj.smpls) dml_obj_ext_smpls.fit() From 72ad8595a8c3f695832e48ed9e325a8eae7658d4 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:56:38 +0200 Subject: [PATCH 43/98] Update test_apos.py --- doubleml/irm/tests/test_apos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index ff2a378f5..0d1dc9da1 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -76,7 +76,7 @@ def dml_apos_fixture(learner, n_rep, normalize_ipw, trimming_threshold, treatmen "score": 'APO', "normalize_ipw": normalize_ipw, "trimming_rule": 'truncate', - "trimming_threshold": trimming_threshold, + "trimming_threshold": trimming_threshold, } unfitted_apos_model = dml.DoubleMLAPOS(**input_args) From c82ca7ea726f758897bdce8829ac42d63fb886a2 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:00:40 +0200 Subject: [PATCH 44/98] Create test_apos_weighted_scores.py --- .../irm/tests/test_apos_weighted_scores.py | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 doubleml/irm/tests/test_apos_weighted_scores.py diff --git a/doubleml/irm/tests/test_apos_weighted_scores.py b/doubleml/irm/tests/test_apos_weighted_scores.py new file mode 100644 index 000000000..84e6ac1c3 --- /dev/null +++ b/doubleml/irm/tests/test_apos_weighted_scores.py @@ -0,0 +1,97 @@ +import pytest +import numpy as np +import pandas as pd + +from sklearn.base import clone +from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +import doubleml as dml +from doubleml.datasets import make_irm_data_discrete_treatments + + +@pytest.fixture(scope='module', + params=[[LinearRegression(), + LogisticRegression(solver='lbfgs', max_iter=250)], + [RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)]]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=['APO']) +def score(request): + return request.param + + +@pytest.fixture(scope='module', + params=[False, True]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0.2, 0.15]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope='module', + params=[[0, 1, 2], [0]]) +def treatment_levels(request): + return request.param + + +@pytest.fixture(scope='module') +def weighted_apos_score_fixture(learner, score, normalize_ipw, trimming_threshold, + treatment_levels): + n_obs = 500 + n_folds = 2 + + # collect data + data = make_irm_data_discrete_treatments(n_obs=n_obs) + y = data['y'] + x = data['x'] + d = 
data['d'] + df = pd.DataFrame( + np.column_stack((y, d, x)), + columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])] + ) + + obj_dml_data = dml.DoubleMLData(df, 'y', 'd') + + input_args = { + 'obj_dml_data': obj_dml_data, + 'ml_g': clone(learner[0]), + 'ml_m': clone(learner[1]), + 'treatment_levels': treatment_levels, + 'n_folds': n_folds, + 'score': score, + 'normalize_ipw': normalize_ipw, + 'trimming_threshold': trimming_threshold, + 'trimming_rule': 'truncate' + } + + np.random.seed(3141) + dml_obj = dml.DoubleMLAPOS(**input_args) + dml_obj.fit() + + weights = 0.5 * np.ones_like(obj_dml_data.y) + dml_obj_weighted = dml.DoubleMLAPOS(draw_sample_splitting=False, + weights=weights, + **input_args) + dml_obj_weighted.set_sample_splitting(all_smpls=dml_obj.smpls) + dml_obj_weighted.fit() + + result_dict = { + 'coef': dml_obj.coef, + 'weighted_coef': dml_obj_weighted.coef, + } + return result_dict + + +@pytest.mark.ci +def test_apos_weighted_coef(weighted_apos_score_fixture): + assert np.allclose(0.5 * weighted_apos_score_fixture['coef'], + weighted_apos_score_fixture['weighted_coef']) From 5d07fe67ac0b3e9642113bb1e2966660661ae53a Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:04:46 +0200 Subject: [PATCH 45/98] Create test_apos_classfier.py --- doubleml/irm/tests/test_apos_classfier.py | 200 ++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 doubleml/irm/tests/test_apos_classfier.py diff --git a/doubleml/irm/tests/test_apos_classfier.py b/doubleml/irm/tests/test_apos_classfier.py new file mode 100644 index 000000000..9c3e7d351 --- /dev/null +++ b/doubleml/irm/tests/test_apos_classfier.py @@ -0,0 +1,200 @@ +import numpy as np +import pandas as pd +import pytest + +from sklearn.base import clone + +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier + +import doubleml as dml +from doubleml.datasets import make_irm_data_discrete_treatments + +from ._utils_apos_manual import fit_apos, boot_apos +from ...tests._utils import confint_manual + + +@pytest.fixture(scope='module', + params=[[LogisticRegression(solver='lbfgs', max_iter=250), + LogisticRegression(solver='lbfgs', max_iter=250)], + [RandomForestClassifier(max_depth=2, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=2, n_estimators=10, random_state=42)]]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=[1]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope='module', + params=[False, True]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope='module', + params=[0.2, 0.15]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope='module', + params=[[0, 1, 2], [0]]) +def treatment_levels(request): + return request.param + + +@pytest.fixture(scope='module') +def dml_apos_classifier_fixture(learner, n_rep, normalize_ipw, trimming_threshold, treatment_levels): + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 499 + + np.random.seed(3141) + n_obs = 500 + data = make_irm_data_discrete_treatments(n_obs=n_obs) + y = np.random.binomial(1, 0.5, n_obs) + x = data['x'] + d = data['d'] + df = pd.DataFrame( + np.column_stack((y, d, x)), + columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])] + ) + + dml_data = dml.DoubleMLData(df, 'y', 'd') + + input_args = { + 'obj_dml_data': dml_data, + 'ml_g': clone(learner[0]), + 'ml_m': 
clone(learner[1]), + "treatment_levels": treatment_levels, + "n_folds": n_folds, + "n_rep": n_rep, + "score": 'APO', + "normalize_ipw": normalize_ipw, + "trimming_rule": 'truncate', + "trimming_threshold": trimming_threshold, + } + + unfitted_apos_model = dml.DoubleMLAPOS(**input_args) + np.random.seed(42) + dml_obj = dml.DoubleMLAPOS(**input_args) + dml_obj.fit() + # get the sample splitting + all_smpls = dml_obj.smpls + + np.random.seed(42) + dml_obj_ext_smpls = dml.DoubleMLAPOS(**input_args, draw_sample_splitting=False) + dml_obj_ext_smpls.set_sample_splitting(dml_obj.smpls) + dml_obj_ext_smpls.fit() + + np.random.seed(42) + res_manual = fit_apos( + y, x, d, + clone(learner[0]), clone(learner[1]), + treatment_levels=treatment_levels, + all_smpls=all_smpls, + score='APO', + trimming_rule='truncate', + normalize_ipw=normalize_ipw, + trimming_threshold=trimming_threshold) + + ci = dml_obj.confint(joint=False, level=0.95) + ci_ext_smpls = dml_obj_ext_smpls.confint(joint=False, level=0.95) + ci_manual = confint_manual( + res_manual['apos'], res_manual['se'], treatment_levels, + boot_t_stat=None, joint=False, level=0.95 + ) + + res_dict = { + 'coef': dml_obj.coef, + 'coef_ext_smpls': dml_obj_ext_smpls.coef, + 'coef_manual': res_manual['apos'], + 'se': dml_obj.se, + 'se_ext_smpls': dml_obj_ext_smpls.se, + 'se_manual': res_manual['se'], + 'boot_methods': boot_methods, + 'n_treatment_levels': len(treatment_levels), + 'n_rep': n_rep, + 'ci': ci.to_numpy(), + 'ci_ext_smpls': ci_ext_smpls.to_numpy(), + 'ci_manual': ci_manual.to_numpy(), + 'apos_model': dml_obj, + 'unfitted_apos_model': unfitted_apos_model + } + + for bootstrap in boot_methods: + np.random.seed(42) + boot_t_stat = boot_apos(res_manual['apo_scaled_score'], res_manual['all_se'], treatment_levels, + all_smpls, n_rep, bootstrap, n_rep_boot) + + np.random.seed(42) + dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict['boot_t_stat_' + bootstrap] = dml_obj.boot_t_stat + res_dict['boot_t_stat_' + bootstrap + '_manual'] = boot_t_stat + + ci = dml_obj.confint(joint=True, level=0.95) + ci_manual = confint_manual(res_manual['apos'], res_manual['se'], treatment_levels, + boot_t_stat=boot_t_stat, joint=True, level=0.95) + res_dict['boot_ci_' + bootstrap] = ci.to_numpy() + res_dict['boot_ci_' + bootstrap + '_manual'] = ci_manual.to_numpy() + + return res_dict + + +@pytest.mark.ci +def test_dml_apos_coef(dml_apos_classifier_fixture): + assert np.allclose(dml_apos_classifier_fixture['coef'], + dml_apos_classifier_fixture['coef_manual'], + rtol=1e-9, atol=1e-9) + assert np.allclose(dml_apos_classifier_fixture['coef'], + dml_apos_classifier_fixture['coef_ext_smpls'], + rtol=1e-9, atol=1e-9) + + +@pytest.mark.ci +def test_dml_apos_se(dml_apos_classifier_fixture): + assert np.allclose(dml_apos_classifier_fixture['se'], + dml_apos_classifier_fixture['se_manual'], + rtol=1e-9, atol=1e-9) + assert np.allclose(dml_apos_classifier_fixture['se'], + dml_apos_classifier_fixture['se_ext_smpls'], + rtol=1e-9, atol=1e-9) + + +@pytest.mark.ci +def test_dml_apos_boot(dml_apos_classifier_fixture): + for bootstrap in dml_apos_classifier_fixture['boot_methods']: + assert np.allclose(dml_apos_classifier_fixture['boot_t_stat_' + bootstrap], + dml_apos_classifier_fixture['boot_t_stat_' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_apos_ci(dml_apos_classifier_fixture): + for bootstrap in dml_apos_classifier_fixture['boot_methods']: + assert np.allclose(dml_apos_classifier_fixture['ci'], + 
dml_apos_classifier_fixture['ci_manual'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_apos_classifier_fixture['ci'], + dml_apos_classifier_fixture['ci_ext_smpls'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_apos_classifier_fixture['boot_ci_' + bootstrap], + dml_apos_classifier_fixture['boot_ci_' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_doubleml_apos_return_types(dml_apos_classifier_fixture): + assert isinstance(dml_apos_classifier_fixture['apos_model'].__str__(), str) + assert isinstance(dml_apos_classifier_fixture['apos_model'].summary, pd.DataFrame) + + assert dml_apos_classifier_fixture['apos_model'].all_coef.shape == ( + dml_apos_classifier_fixture['n_treatment_levels'], + dml_apos_classifier_fixture['n_rep'] + ) + assert isinstance(dml_apos_classifier_fixture['unfitted_apos_model'].summary, pd.DataFrame) From 402a30b45644d6eb101b7867b8aab5e067ecbf8a Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:03:40 +0200 Subject: [PATCH 46/98] add treatment_levels property --- doubleml/irm/apos.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index bf86644b6..2d000db22 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -97,6 +97,13 @@ def n_treatment_levels(self): """ return self._n_treatment_levels + @property + def treatment_levels(self): + """ + The evaluated treatment levels. + """ + return self._treatment_levels + @property def normalize_ipw(self): """ @@ -308,7 +315,7 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ if external_predictions is not None: raise NotImplementedError(f"External predictions not implemented for {self.__class__.__name__}.") - # parallel estimation of the quantiles + # parallel estimation of the models parallel = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch='2*n_jobs') fitted_models = parallel(delayed(self._fit_model)(i_level, n_jobs_cv, store_predictions, store_models) for i_level in range(self.n_treatment_levels)) From 25e710bd22ccb86e76eadd8c0f8ffdf29d5363bd Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Mon, 22 Jul 2024 21:17:47 +0200 Subject: [PATCH 47/98] add simple average treatment effects --- doubleml/irm/apos.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 2d000db22..d889d967a 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -459,6 +459,33 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): return self + def average_treatment_effect(self, baseline_level=None): + """ + Average treatment effects for DoubleMLAPOS models. + + Parameters + ---------- + baseline_level : None or int + The baseline level for the average treatment effect. + Default is ``None``. + + Returns + ------- + ate : pd.Series + A data frame with the average treatment effect(s). 
+ """ + + if self.framework is None: + raise ValueError('Apply fit() before average_treatment_effect().') + + i_baseline_level = self.treatment_levels.tolist().index(baseline_level) + baseline_framework = self.modellist[i_baseline_level].framework + + ate_frameworks = [model.framework - baseline_framework for i, model in + enumerate(self.modellist) if i != i_baseline_level] + ate = concat(ate_frameworks) + return ate + def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False): model = self.modellist[i_level] From 660fdce19bd40aaba8619d007fb310ac9d97f7fe Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Tue, 23 Jul 2024 08:48:55 +0200 Subject: [PATCH 48/98] add optional treatment_names to framework --- doubleml/double_ml_framework.py | 41 +++++++++++++++++++-- doubleml/tests/test_framework_exceptions.py | 18 +++++++++ doubleml/utils/_descriptive.py | 8 ++-- 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py index c528fef84..8b603f5d0 100644 --- a/doubleml/double_ml_framework.py +++ b/doubleml/double_ml_framework.py @@ -9,6 +9,7 @@ from .utils._estimation import _draw_weights, _aggregate_coefs_and_ses, _var_est from .utils._checks import _check_bootstrap, _check_framework_compatibility, _check_in_zero_one, \ _check_float, _check_integer, _check_bool, _check_benchmarks +from .utils._descriptive import generate_summary from .utils._plots import _sensitivity_contour_plot @@ -17,10 +18,10 @@ class DoubleMLFramework(): Parameters ---------- - doubleml_dict : :dict + doubleml_dict : :dict A dictionary providing the estimated parameters and normalized scores. Keys have to be 'thetas', 'ses', - 'all_thetas', 'all_ses', 'var_scaling_factors' and 'scaled_psi'. - Values have to be numpy arrays with the corresponding shapes. + 'all_thetas', 'all_ses', 'var_scaling_factors' and 'scaled_psi'. + Values have to be numpy arrays with the corresponding shapes. """ @@ -57,6 +58,12 @@ def __init__( # check if all sizes match self._check_framework_shapes() + + self._treatment_names = None + if 'treatment_names' in doubleml_dict.keys(): + self._check_treatment_names(doubleml_dict['treatment_names']) + self._treatment_names = doubleml_dict['treatment_names'] + # initialize bootstrap distribution self._boot_t_stat = None self._boot_method = None @@ -196,6 +203,16 @@ def sensitivity_params(self): """ return self._sensitivity_params + @property + def summary(self): + """ + A summary for the estimated causal effect ``theta``. + """ + ci = self.confint() + df_summary = generate_summary(self.thetas, self.ses, self.t_stats, + self.pvals, ci, self._treatment_names) + return df_summary + def __add__(self, other): if isinstance(other, DoubleMLFramework): @@ -612,8 +629,11 @@ def confint(self, joint=False, level=0.95): self.all_thetas + self.all_ses * critical_values), axis=1) ci = np.median(self._all_cis, axis=2) - # TODO: add treatment names df_ci = pd.DataFrame(ci, columns=['{:.1f} %'.format(i * 100) for i in percentages]) + + if self._treatment_names is not None: + df_ci.set_index(pd.Index(self._treatment_names), inplace=True) + return df_ci def bootstrap(self, method='normal', n_rep_boot=500): @@ -944,6 +964,19 @@ def _check_framework_shapes(self): return None + def _check_treatment_names(self, treatment_names): + if not isinstance(treatment_names, list): + raise TypeError('treatment_names must be a list. 
' + f'Got {str(treatment_names)} of type {str(type(treatment_names))}.') + is_str = [isinstance(name, str) for name in treatment_names] + if not all(is_str): + raise TypeError('treatment_names must be a list of strings. ' + f'At least one element is not a string: {str(treatment_names)}.') + if len(treatment_names) != self._n_thetas: + raise ValueError('The length of treatment_names does not match the number of treatments. ' + f'Got {self._n_thetas} treatments and {len(treatment_names)} treatment names.') + return None + def concat(objs): """ diff --git a/doubleml/tests/test_framework_exceptions.py b/doubleml/tests/test_framework_exceptions.py index 7dc8849b2..45cf14cd5 100644 --- a/doubleml/tests/test_framework_exceptions.py +++ b/doubleml/tests/test_framework_exceptions.py @@ -142,6 +142,24 @@ def test_input_exceptions(): test_dict['cluster_dict'] = {'cluster_ids': np.ones(shape=(n_obs, n_rep))} DoubleMLFramework(test_dict) + msg = "treatment_names must be a list. Got 1 of type ." + with pytest.raises(TypeError, match=msg): + test_dict = copy.deepcopy(doubleml_dict) + test_dict['treatment_names'] = 1 + DoubleMLFramework(test_dict) + + msg = r"treatment_names must be a list of strings. At least one element is not a string: \['test', 1\]." + with pytest.raises(TypeError, match=msg): + test_dict = copy.deepcopy(doubleml_dict) + test_dict['treatment_names'] = ['test', 1] + DoubleMLFramework(test_dict) + + msg = "The length of treatment_names does not match the number of treatments. Got 2 treatments and 3 treatment names." + with pytest.raises(ValueError, match=msg): + test_dict = copy.deepcopy(doubleml_dict) + test_dict['treatment_names'] = ['test', 'test2', 'test3'] + DoubleMLFramework(test_dict) + def test_operation_exceptions(): # addition diff --git a/doubleml/utils/_descriptive.py b/doubleml/utils/_descriptive.py index 79924a17e..54144bc8c 100644 --- a/doubleml/utils/_descriptive.py +++ b/doubleml/utils/_descriptive.py @@ -2,12 +2,12 @@ import pandas as pd -def generate_summary(coef, se, t_stat, pval, ci, index_names): +def generate_summary(coef, se, t_stat, pval, ci, index_names=None): col_names = ['coef', 'std err', 't', 'P>|t|'] summary_stats = np.transpose(np.vstack( [coef, se, t_stat, pval])) - df_summary = pd.DataFrame(summary_stats, - columns=col_names, - index=index_names) + df_summary = pd.DataFrame(summary_stats, columns=col_names) + if index_names is not None: + df_summary.index = index_names df_summary = df_summary.join(ci) return df_summary From 02388d6fd1988096ae3904b7ef88f8bd8bbc29ce Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Tue, 23 Jul 2024 08:52:44 +0200 Subject: [PATCH 49/98] fix dimensions in docstrings --- doubleml/irm/apos.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index d889d967a..1fba6d7a1 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -217,15 +217,14 @@ def all_se(self): @property def t_stat(self): """ - t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_quantiles``,)). + t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_treatment_levels``,)). """ - t_stat = self.coef / self.se - return t_stat + return self.framework.t_stats @property def pval(self): """ - p-values for the causal parameter(s) (shape (``n_quantiles``,)). + p-values for the causal parameter(s) (shape (``n_treatment_levels``,)). 
""" return self.framework.pvals @@ -251,7 +250,7 @@ def framework(self): def boot_t_stat(self): """ Bootstrapped t-statistics for the causal parameter(s) after calling :meth:`fit` and :meth:`bootstrap` - (shape (``n_rep_boot``, ``n_quantiles``, ``n_rep``)). + (shape (``n_rep_boot``, ``n_treatment_levels``, ``n_rep``)). """ if self._framework is None: boot_t_stat = None From a17644f58b426b04bcca0858f427a9ef0227fce9 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Tue, 23 Jul 2024 09:00:16 +0200 Subject: [PATCH 50/98] add setter for treatment_names in framework --- doubleml/double_ml_framework.py | 14 +++++++++++++- doubleml/tests/test_framework_exceptions.py | 9 +++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py index 8b603f5d0..633d6464e 100644 --- a/doubleml/double_ml_framework.py +++ b/doubleml/double_ml_framework.py @@ -203,10 +203,22 @@ def sensitivity_params(self): """ return self._sensitivity_params + @property + def treatment_names(self): + """ + Names of the treatments. + """ + return self._treatment_names + + @treatment_names.setter + def treatment_names(self, value): + self._check_treatment_names(value) + self._treatment_names = value + @property def summary(self): """ - A summary for the estimated causal effect ``theta``. + A summary for the estimated causal parameters ``thetas``. """ ci = self.confint() df_summary = generate_summary(self.thetas, self.ses, self.t_stats, diff --git a/doubleml/tests/test_framework_exceptions.py b/doubleml/tests/test_framework_exceptions.py index 45cf14cd5..b80cfac27 100644 --- a/doubleml/tests/test_framework_exceptions.py +++ b/doubleml/tests/test_framework_exceptions.py @@ -142,23 +142,32 @@ def test_input_exceptions(): test_dict['cluster_dict'] = {'cluster_ids': np.ones(shape=(n_obs, n_rep))} DoubleMLFramework(test_dict) + test_dict = copy.deepcopy(doubleml_dict) + framework_names = DoubleMLFramework(test_dict) + msg = "treatment_names must be a list. Got 1 of type ." with pytest.raises(TypeError, match=msg): test_dict = copy.deepcopy(doubleml_dict) test_dict['treatment_names'] = 1 DoubleMLFramework(test_dict) + with pytest.raises(TypeError, match=msg): + framework_names.treatment_names = 1 msg = r"treatment_names must be a list of strings. At least one element is not a string: \['test', 1\]." with pytest.raises(TypeError, match=msg): test_dict = copy.deepcopy(doubleml_dict) test_dict['treatment_names'] = ['test', 1] DoubleMLFramework(test_dict) + with pytest.raises(TypeError, match=msg): + framework_names.treatment_names = ['test', 1] msg = "The length of treatment_names does not match the number of treatments. Got 2 treatments and 3 treatment names." 
 with pytest.raises(ValueError, match=msg):
 test_dict = copy.deepcopy(doubleml_dict)
 test_dict['treatment_names'] = ['test', 'test2', 'test3']
 DoubleMLFramework(test_dict)
+ with pytest.raises(ValueError, match=msg):
+ framework_names.treatment_names = ['test', 'test2', 'test3']


 def test_operation_exceptions():
From 2cea2cedb672cd38c0eb5ddb65bf48eb4f914db5 Mon Sep 17 00:00:00 2001
From: Sven1704
Date: Tue, 23 Jul 2024 09:24:29 +0200
Subject: [PATCH 51/98] rename to causal_contrast

---
 doubleml/irm/apos.py | 32 ++++++++++++----------
 doubleml/irm/tests/test_apos_exceptions.py | 8 ++++++
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py
index 1fba6d7a1..8c6ad8e04 100644
--- a/doubleml/irm/apos.py
+++ b/doubleml/irm/apos.py
@@ -458,32 +458,36 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):

 return self

- def average_treatment_effect(self, baseline_level=None):
+ def causal_contrast(self, reference_level):
 """
- Average treatment effects for DoubleMLAPOS models.
+ Average causal contrasts for DoubleMLAPOS models. Estimates the difference in
+ average potential outcomes between the treatment levels and the reference level.
+ The reference has to be one of the treatment levels.

 Parameters
 ----------
- baseline_level : None or int
- The baseline level for the average treatment effect.
- Default is ``None``.
+ reference_level :
+ The reference level for the difference in average potential outcomes.
+ Has to be an element of ``treatment_levels``.

 Returns
 -------
- ate : pd.Series
- A data frame with the average treatment effect(s).
+ acc : DoubleMLFramework
+ A DoubleMLFramework class for average causal contrast(s).
 """

 if self.framework is None:
- raise ValueError('Apply fit() before average_treatment_effect().')
+ raise ValueError('Apply fit() before causal_contrast().')

- i_baseline_level = self.treatment_levels.tolist().index(baseline_level)
- baseline_framework = self.modellist[i_baseline_level].framework
+ i_reference_level = self.treatment_levels.tolist().index(reference_level)
+ reference_framework = self.modellist[i_reference_level].framework

- ate_frameworks = [model.framework - baseline_framework for i, model in
- enumerate(self.modellist) if i != i_baseline_level]
- ate = concat(ate_frameworks)
- return ate
+ acc_frameworks = [model.framework - reference_framework for i, model in
+ enumerate(self.modellist) if i != i_reference_level]
+ acc = concat(acc_frameworks)
+ acc.treatment_names = [f"{self.treatment_levels[i]} vs {reference_level}" for i in
+ range(self.n_treatment_levels) if i != i_reference_level]
+ return acc

 def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False):
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 9081a4e4a..ab123d815 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -70,3 +70,11 @@ def test_apos_exception_ipw_normalization():
 msg = "Normalization indicator has to be boolean. Object of type passed."
 with pytest.raises(TypeError, match=msg):
 _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, normalize_ipw=1)
+
+
+@pytest.mark.ci
+def test_causal_contrast_exceptions():
+ msg = r"Apply fit() before causal_contrast()."
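Numerically, the frameworks concatenated by causal_contrast reduce to elementwise differences of the APO estimates. A tiny sketch with made-up coefficients:

import numpy as np

apo_coefs = np.array([0.5, 1.3, 2.1])  # hypothetical APOs for levels 0, 1, 2
contrasts_vs_0 = apo_coefs[1:] - apo_coefs[0]
print(contrasts_vs_0)  # [0.8 1.6], i.e. APO(1) - APO(0) and APO(2) - APO(0)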
+ with pytest.raises(ValueError, match=msg): + dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0) + dml_obj.causal_contrast() From 739641e57a21d8d43cf945c8ace80c4bbb74a27d Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Tue, 23 Jul 2024 10:20:26 +0200 Subject: [PATCH 52/98] update treatment_levels to allow for iterable objects --- doubleml/irm/apos.py | 44 +++++++++++++++------- doubleml/irm/tests/test_apos_exceptions.py | 20 ++++++++-- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 8c6ad8e04..8efb437c7 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +from collections.abc import Iterable from sklearn.base import clone @@ -35,8 +36,7 @@ def __init__(self, self._is_cluster_data = isinstance(obj_dml_data, DoubleMLClusterData) self._check_data(self._dml_data) - self._treatment_levels = np.asarray(treatment_levels).reshape((-1, )) - self._check_treatment_levels() + self._treatment_levels = self._check_treatment_levels(treatment_levels) self._n_treatment_levels = len(self._treatment_levels) self._normalize_ipw = normalize_ipw @@ -458,16 +458,17 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): return self - def causal_contrast(self, reference_level): + def causal_contrast(self, reference_levels): """ Average causal contrasts for DoubleMLAPOS models. Estimates the difference in - average potential outcomes between the treatment levels and the reference level. - The reference has to be one of the treatment levels. + average potential outcomes between the treatment levels and the reference levels. + The reference levels have to be a subset of the treatment levels or a single + treatment level. Parameters ---------- - reference_level : - The reference level for the difference in average potential outcomes. + reference_levels : + The reference levels for the difference in average potential outcomes. Has to be an element of ``treatment_levels``. Returns @@ -478,14 +479,23 @@ def causal_contrast(self, reference_level): if self.framework is None: raise ValueError('Apply fit() before causal_contrast().') - - i_reference_level = self.treatment_levels.tolist().index(reference_level) + is_iterable = isinstance(reference_levels, Iterable) + if not is_iterable: + reference_levels = [reference_levels] + is_treatment_level_subset = set(reference_levels).issubset(set(self.treatment_levels)) + if not is_treatment_level_subset: + raise ValueError('Invalid reference_levels. 
reference_levels has to be an iterable subset of treatment_levels or ' + 'a single treatment level.') + + for ref_lvl in reference_levels: + i_ref_lvl = self.treatment_levels.to + i_ref_lvls = self.treatment_levels.tolist().index(reference_levels) reference_framework = self.modellist[i_reference_level].framework acc_frameworks = [model.framework - reference_framework for i, model in enumerate(self.modellist) if i != i_reference_level] acc = concat(acc_frameworks) - acc.treatment_names = [f"{self.treatment_levels[i]} vs {reference_level}" for i in + acc.treatment_names = [f"{self.treatment_levels[i]} vs {reference_levels}" for i in range(self.n_treatment_levels) if i != i_reference_level] return acc @@ -495,9 +505,17 @@ def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_mode model.fit(n_jobs_cv=n_jobs_cv, store_predictions=store_predictions, store_models=store_models) return model - def _check_treatment_levels(self): - if not np.all(np.isin(self._treatment_levels, np.unique(self._dml_data.d))): - raise ValueError('The treatment levels have to be a subset of the unique treatment levels in the data.') + def _check_treatment_levels(self, treatment_levels): + is_iterable = isinstance(treatment_levels, Iterable) + if not is_iterable: + treatment_level_list = [treatment_levels] + else: + treatment_level_list = [t_lvl for t_lvl in treatment_levels] + is_d_subset = set(treatment_level_list).issubset(set(np.unique(self._dml_data.d))) + if not is_d_subset: + raise ValueError('Invalid reference_levels. reference_levels has to be an iterable subset or ' + 'a single element of the unique treatment levels in the data.') + return treatment_level_list def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index ab123d815..a895f0151 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -31,7 +31,8 @@ def test_apos_exception_data(): dml_data_z = make_iivm_data() _ = DoubleMLAPOS(dml_data_z, ml_g, ml_m, treatment_levels=0) - msg = 'The treatment levels have to be a subset of the unique treatment levels in the data.' + msg = ('Invalid reference_levels. reference_levels has to be an iterable subset or ' + 'a single element of the unique treatment levels in the data.') with pytest.raises(ValueError, match=msg): _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[1.1]) with pytest.raises(ValueError, match=msg): @@ -74,7 +75,18 @@ def test_apos_exception_ipw_normalization(): @pytest.mark.ci def test_causal_contrast_exceptions(): - msg = r"Apply fit() before causal_contrast()." + msg = r"Apply fit\(\) before causal_contrast\(\)." with pytest.raises(ValueError, match=msg): - dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0) - dml_obj.causal_contrast() + dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0, 1]) + dml_obj.causal_contrast(reference_levels=0) + + dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0, 1]) + dml_obj.fit() + msg = ('Invalid reference_levels. 
reference_levels has to be an iterable subset of treatment_levels or ' + 'a single treatment level.') + with pytest.raises(ValueError, match=msg): + dml_obj.causal_contrast(reference_levels=2) + with pytest.raises(ValueError, match=msg): + dml_obj.causal_contrast(reference_levels=[2]) + with pytest.raises(ValueError, match=msg): + dml_obj.causal_contrast(reference_levels=[0, 2]) From 310d532195e7d4d083313eeb7ab665b38d3eab8b Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Tue, 23 Jul 2024 16:52:57 +0200 Subject: [PATCH 53/98] add causal_contrasts to apos with unit tests --- doubleml/irm/apos.py | 25 +++++++++++++-------- doubleml/irm/tests/test_apos.py | 26 ++++++++++++++++++++++ doubleml/irm/tests/test_apos_exceptions.py | 6 +++++ 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 8efb437c7..88cbb803d 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -479,6 +479,8 @@ def causal_contrast(self, reference_levels): if self.framework is None: raise ValueError('Apply fit() before causal_contrast().') + if self.n_treatment_levels == 1: + raise ValueError('Only one treatment level. No causal contrast can be computed.') is_iterable = isinstance(reference_levels, Iterable) if not is_iterable: reference_levels = [reference_levels] @@ -487,16 +489,21 @@ def causal_contrast(self, reference_levels): raise ValueError('Invalid reference_levels. reference_levels has to be an iterable subset of treatment_levels or ' 'a single treatment level.') + skip_index = [] + all_treatment_names = [] + all_acc_frameworks = [] for ref_lvl in reference_levels: - i_ref_lvl = self.treatment_levels.to - i_ref_lvls = self.treatment_levels.tolist().index(reference_levels) - reference_framework = self.modellist[i_reference_level].framework - - acc_frameworks = [model.framework - reference_framework for i, model in - enumerate(self.modellist) if i != i_reference_level] - acc = concat(acc_frameworks) - acc.treatment_names = [f"{self.treatment_levels[i]} vs {reference_levels}" for i in - range(self.n_treatment_levels) if i != i_reference_level] + i_ref_lvl = self.treatment_levels.index(ref_lvl) + ref_framework = self.modellist[i_ref_lvl].framework + + skip_index += [i_ref_lvl] + all_acc_frameworks += [model.framework - ref_framework for i, model in + enumerate(self.modellist) if i not in skip_index] + all_treatment_names += [f"{self.treatment_levels[i]} vs {self.treatment_levels[i_ref_lvl]}" for + i in range(self.n_treatment_levels) if i not in skip_index] + + acc = concat(all_acc_frameworks) + acc.treatment_names = all_treatment_names return acc def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False): diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 0d1dc9da1..4ba478a41 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -143,6 +143,13 @@ def dml_apos_fixture(learner, n_rep, normalize_ipw, trimming_threshold, treatmen res_dict['boot_ci_' + bootstrap] = ci.to_numpy() res_dict['boot_ci_' + bootstrap + '_manual'] = ci_manual.to_numpy() + # causal contrasts + if len(treatment_levels) > 1: + acc_single = dml_obj.causal_contrast(reference_levels=[treatment_levels[0]]) + res_dict['causal_contrast_single'] = acc_single + acc_multiple = dml_obj.causal_contrast(reference_levels=treatment_levels) + res_dict['causal_contrast_multiple'] = acc_multiple + return res_dict @@ -198,3 +205,22 @@ def test_doubleml_apos_return_types(dml_apos_fixture): 
dml_apos_fixture['n_rep'] ) assert isinstance(dml_apos_fixture['unfitted_apos_model'].summary, pd.DataFrame) + if dml_apos_fixture['n_treatment_levels'] > 1: + assert isinstance(dml_apos_fixture['causal_contrast_single'], dml.DoubleMLFramework) + assert isinstance(dml_apos_fixture['causal_contrast_multiple'], dml.DoubleMLFramework) + + +@pytest.mark.ci +def test_doubleml_apos_causal_contrast(dml_apos_fixture): + if dml_apos_fixture['n_treatment_levels'] == 1: + pytest.skip("Skipping test as n_treatment_levels is 1") + + acc_single = dml_apos_fixture['coef'][1:] - dml_apos_fixture['coef'][0] + assert np.allclose(dml_apos_fixture['causal_contrast_single'].thetas, + acc_single, + rtol=1e-9, atol=1e-9) + + acc_multiple = np.append(acc_single, dml_apos_fixture['coef'][2] - dml_apos_fixture['coef'][1]) + assert np.allclose(dml_apos_fixture['causal_contrast_multiple'].thetas, + acc_multiple, + rtol=1e-9, atol=1e-9) diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index a895f0151..32dc04f21 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -80,6 +80,12 @@ def test_causal_contrast_exceptions(): dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0, 1]) dml_obj.causal_contrast(reference_levels=0) + msg = 'Only one treatment level. No causal contrast can be computed.' + with pytest.raises(ValueError, match=msg): + dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0]) + dml_obj.fit() + dml_obj.causal_contrast(reference_levels=0) + dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0, 1]) dml_obj.fit() msg = ('Invalid reference_levels. reference_levels has to be an iterable subset of treatment_levels or ' From 3a73ac993e12e22e1a8b4efd051a2ed823324885 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:53:19 +0200 Subject: [PATCH 54/98] update scaling for apos dgp --- doubleml/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index b3f6d745d..e2f2dcddf 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1536,7 +1536,7 @@ def f_treatment(w, xi): res = xi * (-w[:, 0] + 0.5*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3]) return res - def treatment_effect(d, scale=5): + def treatment_effect(d, scale=15): return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 z_tilde_1 = np.exp(0.5 * x[:, 0]) From 879cfb0cfcec6be687ea700403222748da4c0cf9 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 08:21:59 +0200 Subject: [PATCH 55/98] reduce irm settings for unit tests --- doubleml/irm/tests/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doubleml/irm/tests/conftest.py b/doubleml/irm/tests/conftest.py index 3f57b4220..6fe207b06 100644 --- a/doubleml/irm/tests/conftest.py +++ b/doubleml/irm/tests/conftest.py @@ -14,8 +14,7 @@ def _g(x): @pytest.fixture(scope='session', params=[(500, 10), - (1000, 20), - (1000, 100)]) + (1000, 20)]) def generate_data_irm(request): n_p = request.param np.random.seed(1111) From aaa581ea069f0bbbc325081ad6c555870d6f6c33 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 08:25:10 +0200 Subject: [PATCH 56/98] extend weight tests for apo --- .../irm/tests/test_apo_weighted_scores.py | 70 +++++++++++++------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/doubleml/irm/tests/test_apo_weighted_scores.py 
b/doubleml/irm/tests/test_apo_weighted_scores.py index 94d81170c..17fea8a0a 100644 --- a/doubleml/irm/tests/test_apo_weighted_scores.py +++ b/doubleml/irm/tests/test_apo_weighted_scores.py @@ -24,6 +24,12 @@ def score(request): return request.param +@pytest.fixture(scope='module', + params=[1, 3]) +def n_rep(request): + return request.param + + @pytest.fixture(scope='module', params=[False, True]) def normalize_ipw(request): @@ -43,7 +49,7 @@ def treatment_level(request): @pytest.fixture(scope='module') -def weighted_apo_score_fixture(generate_data_irm, learner, score, normalize_ipw, trimming_threshold, +def weighted_apo_score_fixture(generate_data_irm, learner, score, n_rep, normalize_ipw, trimming_threshold, treatment_level): n_folds = 2 @@ -53,38 +59,44 @@ def weighted_apo_score_fixture(generate_data_irm, learner, score, normalize_ipw, all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=d) obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) - # Set machine learning methods for m & g - ml_g = clone(learner[0]) - ml_m = clone(learner[1]) - - np.random.seed(3141) - dml_obj = dml.DoubleMLAPO(obj_dml_data, - ml_g, ml_m, - treatment_level, - n_folds, - score=score, - normalize_ipw=normalize_ipw, - trimming_threshold=trimming_threshold, - draw_sample_splitting=False) + input_args = { + "obj_dml_data": obj_dml_data, + "ml_g": clone(learner[0]), + "ml_m": clone(learner[1]), + "treatment_level": treatment_level, + "n_folds": n_folds, + "n_rep": n_rep, + "score": score, + "normalize_ipw": normalize_ipw, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": False, + } + + np.random.seed(42) + dml_obj = dml.DoubleMLAPO(**input_args) dml_obj.set_sample_splitting(all_smpls=all_smpls) dml_obj.fit() + np.random.seed(42) weights = 0.5 * np.ones_like(obj_dml_data.y) - dml_obj_weighted = dml.DoubleMLAPO(obj_dml_data, - ml_g, ml_m, - treatment_level, - n_folds, - score=score, - weights=weights, - normalize_ipw=normalize_ipw, - trimming_threshold=trimming_threshold, - draw_sample_splitting=False) + dml_obj_weighted = dml.DoubleMLAPO(weights=weights, **input_args) dml_obj_weighted.set_sample_splitting(all_smpls=all_smpls) dml_obj_weighted.fit() + np.random.seed(42) + weights_dict = { + 'weights': weights, + 'weights_bar': np.tile(weights[:, np.newaxis], (1, n_rep)), + } + dml_obj_weighted_dict = dml.DoubleMLAPO(weights=weights_dict, **input_args) + dml_obj_weighted_dict.set_sample_splitting(all_smpls=all_smpls) + dml_obj_weighted_dict.fit() + result_dict = { 'coef': dml_obj.coef, 'weighted_coef': dml_obj_weighted.coef, + 'weighted_coef_dict': dml_obj_weighted_dict.coef, + 'default_weights': dml_obj.weights, } return result_dict @@ -93,3 +105,15 @@ def weighted_apo_score_fixture(generate_data_irm, learner, score, normalize_ipw, def test_apo_weighted_coef(weighted_apo_score_fixture): assert np.allclose(0.5 * weighted_apo_score_fixture['coef'], weighted_apo_score_fixture['weighted_coef']) + assert np.allclose(0.5 * weighted_apo_score_fixture['coef'], + weighted_apo_score_fixture['weighted_coef_dict']) + + +def test_apo_default_weights(weighted_apo_score_fixture): + assert isinstance(weighted_apo_score_fixture['default_weights'], dict) + + expected_keys = {'weights'} + assert set(weighted_apo_score_fixture['default_weights'].keys()) == expected_keys + + assert np.allclose(weighted_apo_score_fixture['default_weights']['weights'], + np.ones_like(weighted_apo_score_fixture['default_weights']['weights'])) \ No newline at end of file From 9cbc9b3bcef784000feea76396936043480ae849 Mon Sep 17 
00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 08:28:32 +0200 Subject: [PATCH 57/98] add pytest mark.ci to weight test --- doubleml/irm/tests/test_apo_weighted_scores.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doubleml/irm/tests/test_apo_weighted_scores.py b/doubleml/irm/tests/test_apo_weighted_scores.py index 17fea8a0a..062bfb94e 100644 --- a/doubleml/irm/tests/test_apo_weighted_scores.py +++ b/doubleml/irm/tests/test_apo_weighted_scores.py @@ -109,6 +109,7 @@ def test_apo_weighted_coef(weighted_apo_score_fixture): weighted_apo_score_fixture['weighted_coef_dict']) +@pytest.mark.ci def test_apo_default_weights(weighted_apo_score_fixture): assert isinstance(weighted_apo_score_fixture['default_weights'], dict) From e076f972395351616e8f2b5d20f859e31a451c8b Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 08:58:25 +0200 Subject: [PATCH 58/98] extend weight tests for apos --- doubleml/irm/tests/test_apo_weighted_scores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/irm/tests/test_apo_weighted_scores.py b/doubleml/irm/tests/test_apo_weighted_scores.py index 062bfb94e..5551e5dd0 100644 --- a/doubleml/irm/tests/test_apo_weighted_scores.py +++ b/doubleml/irm/tests/test_apo_weighted_scores.py @@ -117,4 +117,4 @@ def test_apo_default_weights(weighted_apo_score_fixture): assert set(weighted_apo_score_fixture['default_weights'].keys()) == expected_keys assert np.allclose(weighted_apo_score_fixture['default_weights']['weights'], - np.ones_like(weighted_apo_score_fixture['default_weights']['weights'])) \ No newline at end of file + np.ones_like(weighted_apo_score_fixture['default_weights']['weights'])) From 64de6007f12d0620b802754f1e49d480ffbf66bc Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 09:02:39 +0200 Subject: [PATCH 59/98] extend apos weights test --- .../irm/tests/test_apos_weighted_scores.py | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/doubleml/irm/tests/test_apos_weighted_scores.py b/doubleml/irm/tests/test_apos_weighted_scores.py index 84e6ac1c3..3ab8db6af 100644 --- a/doubleml/irm/tests/test_apos_weighted_scores.py +++ b/doubleml/irm/tests/test_apos_weighted_scores.py @@ -25,6 +25,12 @@ def score(request): return request.param +@pytest.fixture(scope='module', + params=[1, 3]) +def n_rep(request): + return request.param + + @pytest.fixture(scope='module', params=[False, True]) def normalize_ipw(request): @@ -44,7 +50,7 @@ def treatment_levels(request): @pytest.fixture(scope='module') -def weighted_apos_score_fixture(learner, score, normalize_ipw, trimming_threshold, +def weighted_apos_score_fixture(learner, score, n_rep, normalize_ipw, trimming_threshold, treatment_levels): n_obs = 500 n_folds = 2 @@ -67,16 +73,18 @@ def weighted_apos_score_fixture(learner, score, normalize_ipw, trimming_threshol 'ml_m': clone(learner[1]), 'treatment_levels': treatment_levels, 'n_folds': n_folds, + 'n_rep': n_rep, 'score': score, 'normalize_ipw': normalize_ipw, 'trimming_threshold': trimming_threshold, 'trimming_rule': 'truncate' } - np.random.seed(3141) + np.random.seed(42) dml_obj = dml.DoubleMLAPOS(**input_args) dml_obj.fit() + np.random.seed(42) weights = 0.5 * np.ones_like(obj_dml_data.y) dml_obj_weighted = dml.DoubleMLAPOS(draw_sample_splitting=False, weights=weights, @@ -84,9 +92,22 @@ def weighted_apos_score_fixture(learner, score, normalize_ipw, trimming_threshol dml_obj_weighted.set_sample_splitting(all_smpls=dml_obj.smpls) dml_obj_weighted.fit() + np.random.seed(42) + weights_dict = 
{ + 'weights': weights, + 'weights_bar': np.tile(weights[:, np.newaxis], (1, n_rep)), + } + dml_obj_weighted_dict = dml.DoubleMLAPOS(draw_sample_splitting=False, + weights=weights_dict, + **input_args) + dml_obj_weighted_dict.set_sample_splitting(all_smpls=dml_obj.smpls) + dml_obj_weighted_dict.fit() + result_dict = { 'coef': dml_obj.coef, 'weighted_coef': dml_obj_weighted.coef, + 'weighted_coef_dict': dml_obj_weighted_dict.coef, + 'default_weights': dml_obj.weights, } return result_dict @@ -95,3 +116,13 @@ def weighted_apos_score_fixture(learner, score, normalize_ipw, trimming_threshol def test_apos_weighted_coef(weighted_apos_score_fixture): assert np.allclose(0.5 * weighted_apos_score_fixture['coef'], weighted_apos_score_fixture['weighted_coef']) + assert np.allclose(0.5 * weighted_apos_score_fixture['coef'], + weighted_apos_score_fixture['weighted_coef_dict']) + + +@pytest.mark.ci +def test_apos_default_weights(weighted_apos_score_fixture): + assert isinstance(weighted_apos_score_fixture['default_weights'], np.ndarray) + + assert np.allclose(weighted_apos_score_fixture['default_weights'], + np.ones_like(weighted_apos_score_fixture['default_weights'])) From c4f6a05ffa11e1927afedb539531403003d4f464 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 10:07:10 +0200 Subject: [PATCH 60/98] remove apply_cross_fitting from apo_manual --- doubleml/irm/tests/_utils_apo_manual.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/doubleml/irm/tests/_utils_apo_manual.py b/doubleml/irm/tests/_utils_apo_manual.py index 862a2793d..e22f80ffe 100644 --- a/doubleml/irm/tests/_utils_apo_manual.py +++ b/doubleml/irm/tests/_utils_apo_manual.py @@ -127,21 +127,18 @@ def var_apo(theta, g_hat0, g_hat1, m_hat, u_hat0, u_hat1, treated, score, n_obs) def boot_apo(y, d, treatment_level, thetas, ses, all_g_hat0, all_g_hat1, all_m_hat, all_smpls, score, bootstrap, n_rep_boot, - n_rep=1, apply_cross_fitting=True, normalize_ipw=True): + n_rep=1, normalize_ipw=True): treated = (d == treatment_level) all_boot_t_stat = list() for i_rep in range(n_rep): smpls = all_smpls[i_rep] - if apply_cross_fitting: - n_obs = len(y) - else: - test_index = smpls[0][1] - n_obs = len(test_index) + n_obs = len(y) + weights = draw_weights(bootstrap, n_rep_boot, n_obs) boot_t_stat = boot_apo_single_split( thetas[i_rep], y, d, treated, all_g_hat0[i_rep], all_g_hat1[i_rep], all_m_hat[i_rep], smpls, - score, ses[i_rep], weights, n_rep_boot, apply_cross_fitting, normalize_ipw) + score, ses[i_rep], weights, n_rep_boot, normalize_ipw) all_boot_t_stat.append(boot_t_stat) boot_t_stat = np.hstack(all_boot_t_stat) @@ -150,7 +147,7 @@ def boot_apo(y, d, treatment_level, thetas, ses, all_g_hat0, all_g_hat1, all_m_h def boot_apo_single_split(theta, y, d, treated, g_hat0_list, g_hat1_list, m_hat_list, - smpls, score, se, weights, n_rep_boot, apply_cross_fitting, normalize_ipw): + smpls, score, se, weights, n_rep_boot, normalize_ipw): _, u_hat1, _, g_hat1, m_hat = compute_residuals( y, g_hat0_list, g_hat1_list, m_hat_list, smpls) @@ -161,7 +158,7 @@ def boot_apo_single_split(theta, y, d, treated, g_hat0_list, g_hat1_list, m_hat_ J = -1.0 psi = g_hat1 + np.divide(np.multiply(treated, u_hat1), m_hat_adj) - theta - boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep_boot, apply_cross_fitting) + boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep_boot) return boot_t_stat From 705afb8f61dd24e8a61edb058a025a4cd5129da0 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 10:14:00 +0200 
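The 0.5 * coef == weighted_coef assertions in these weighted-score tests hold because the APO score is linear in the weights, so constant weights simply rescale the solved parameter. A self-contained sketch with simulated nuisance quantities (all names and values here are illustrative, not library API):

import numpy as np

rng = np.random.default_rng(0)
n = 1000
m_hat = rng.uniform(0.2, 0.8, n)   # simulated propensity estimates
treated = rng.binomial(1, m_hat)   # simulated treatment indicator
g_hat = rng.normal(size=n)         # simulated outcome predictions
y = g_hat + rng.normal(size=n)

# elementwise APO score contributions; theta solves the sample moment equation
psi_b = g_hat + treated * (y - g_hat) / m_hat
theta = psi_b.mean()

# constant weights enter the score multiplicatively, so the estimate scales with them
omega = 0.5
assert np.isclose((omega * psi_b).mean(), omega * theta)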
Subject: [PATCH 61/98] add test for classifier without binary outcome in apo

---
 doubleml/irm/tests/test_apo_exceptions.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py
index 2e9a7a1bd..ce6d4fd79 100644
--- a/doubleml/irm/tests/test_apo_exceptions.py
+++ b/doubleml/irm/tests/test_apo_exceptions.py
@@ -47,6 +47,15 @@ def test_apo_exception_data():
 _ = DoubleMLAPO(dml_data_warn, ml_g, ml_m, treatment_level=42)


+@pytest.mark.ci
+def test_apo_exception_learner():
+ msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier but the outcome variable is not'
+ ' binary with values 0 and 1.')
+ with pytest.raises(ValueError, match=msg):
+ ml_g_classifier = LogisticRegression()
+ _ = DoubleMLAPO(dml_data, ml_g_classifier, ml_m, treatment_level=0)
+
+
 @pytest.mark.ci
 def test_apo_exception_scores():
 msg = 'Invalid score MAR. Valid score APO.'
From 8b318154cd970c8172ab8e415a52b6eb9a280649 Mon Sep 17 00:00:00 2001
From: Sven1704
Date: Thu, 25 Jul 2024 12:04:57 +0200
Subject: [PATCH 62/98] Add exception for classifier in DoubleMLAPOS class

---
 doubleml/irm/apos.py | 12 +++++++++++-
 doubleml/irm/tests/test_apos_exceptions.py | 9 +++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py
index 88cbb803d..95c909dfa 100644
--- a/doubleml/irm/apos.py
+++ b/doubleml/irm/apos.py
@@ -6,6 +6,7 @@

 from joblib import Parallel, delayed

+from ..double_ml import DoubleML
 from ..double_ml_data import DoubleMLData, DoubleMLClusterData
 from .apo import DoubleMLAPO
 from ..double_ml_framework import concat
@@ -60,8 +61,17 @@ def __init__(self,
 raise TypeError('Normalization indicator has to be boolean. ' +
 f'Object of type {str(type(self.normalize_ipw))} passed.')

+ ml_g_is_classifier = DoubleML._check_learner(ml_g, 'ml_g', regressor=True, classifier=True)
+ _ = DoubleML._check_learner(ml_m, 'ml_m', regressor=False, classifier=True)
 self._learner = {'ml_g': clone(ml_g), 'ml_m': clone(ml_m)}
- self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'}
+ if ml_g_is_classifier:
+ if obj_dml_data.binary_outcome:
+ self._predict_method = {'ml_g': 'predict_proba', 'ml_m': 'predict_proba'}
+ else:
+ raise ValueError(f'The ml_g learner {str(ml_g)} was identified as classifier '
+ 'but the outcome variable is not binary with values 0 and 1.')
+ else:
+ self._predict_method = {'ml_g': 'predict', 'ml_m': 'predict_proba'}

 # APO weights
 _check_weights(weights, score="ATE", n_obs=obj_dml_data.n_obs, n_rep=self.n_rep)
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 32dc04f21..e722832b4 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -41,6 +41,15 @@ def test_apos_exception_data():
 _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[1, 2.2])


+@pytest.mark.ci
+def test_apos_exception_learner():
+ msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier but the outcome variable is not'
+ ' binary with values 0 and 1.')
+ with pytest.raises(ValueError, match=msg):
+ ml_g_classifier = LogisticRegression()
+ _ = DoubleMLAPOS(dml_data, ml_g_classifier, ml_m, treatment_level=0)
+
+
 @pytest.mark.ci
 def test_apos_exception_scores():
 msg = 'Invalid score MAR. Valid score APO.'
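Taken together, the pieces above support the following end-to-end use of the new classes. A minimal sketch mirroring the test fixtures (learners, seed and fold count chosen only for illustration):

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression

import doubleml as dml
from doubleml.datasets import make_irm_data_discrete_treatments

np.random.seed(42)
data = make_irm_data_discrete_treatments(n_obs=500)
df = pd.DataFrame(
    np.column_stack((data['y'], data['d'], data['x'])),
    columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])]
)
dml_data = dml.DoubleMLData(df, 'y', 'd')

dml_obj = dml.DoubleMLAPOS(
    dml_data,
    ml_g=LinearRegression(),
    ml_m=LogisticRegression(solver='lbfgs', max_iter=250),
    treatment_levels=[0, 1, 2],
    n_folds=2,
)
dml_obj.fit()
print(dml_obj.summary)

# contrasts of each level against reference level 0, returned as a DoubleMLFramework
contrast = dml_obj.causal_contrast(reference_levels=0)
print(contrast.summary)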
From 9d9b7fd709818da003f04ecd9f637feb9dcb3b0e Mon Sep 17 00:00:00 2001
From: Sven1704
Date: Thu, 25 Jul 2024 12:11:43 +0200
Subject: [PATCH 63/98] Update test_apos_exceptions.py

---
 doubleml/irm/tests/test_apos_exceptions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index e722832b4..058db5f72 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -47,7 +47,7 @@ def test_apos_exception_learner():
 ' binary with values 0 and 1.')
 with pytest.raises(ValueError, match=msg):
 ml_g_classifier = LogisticRegression()
- _ = DoubleMLAPOS(dml_data, ml_g_classifier, ml_m, treatment_level=0)
+ _ = DoubleMLAPOS(dml_data, ml_g_classifier, ml_m, treatment_levels=0)


 @pytest.mark.ci
From 0daf406ba639cd3d5cff440cb992d1a7a7716db7 Mon Sep 17 00:00:00 2001
From: Sven1704
Date: Thu, 25 Jul 2024 15:22:55 +0200
Subject: [PATCH 64/98] add separate function for binary outcome check

---
 doubleml/irm/irm.py | 18 +++---------------
 doubleml/tests/test_exceptions.py | 2 +-
 doubleml/utils/_checks.py | 10 ++++++++++
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py
index 3cf98ec36..1b1695c66 100644
--- a/doubleml/irm/irm.py
+++ b/doubleml/irm/irm.py
@@ -13,7 +13,7 @@

 from ..utils._estimation import _dml_cv_predict, _get_cond_smpls, _dml_tune, _trimm, _normalize_ipw, _cond_targets
 from ..utils._checks import _check_score, _check_trimming, _check_finite_predictions, _check_is_propensity, _check_integer, \
- _check_weights
+ _check_weights, _check_binary_predictions


 class DoubleMLIRM(LinearScoreMixin, DoubleML):
@@ -275,13 +275,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
 g_hat0['targets'] = _cond_targets(g_hat0['targets'], cond_sample=(d == 0))

 if self._dml_data.binary_outcome:
- binary_preds = (type_of_target(g_hat0['preds']) == 'binary')
- zero_one_preds = np.all((np.power(g_hat0['preds'], 2) - g_hat0['preds']) == 0)
- if binary_preds & zero_one_preds:
- raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, '
- f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also '
- 'observed to be binary with values 0 and 1. Make sure that for classifiers '
- 'probabilities and not labels are predicted.')
+ _check_binary_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', self._dml_data.y_col)

 if g1_external:
 # use external predictions
@@ -297,13 +291,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
 g_hat1['targets'] = _cond_targets(g_hat1['targets'], cond_sample=(d == 1))

 if self._dml_data.binary_outcome & (self.score != 'ATTE'):
- binary_preds = (type_of_target(g_hat1['preds']) == 'binary')
- zero_one_preds = np.all((np.power(g_hat1['preds'], 2) - g_hat1['preds']) == 0)
- if binary_preds & zero_one_preds:
- raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, '
- f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also '
- 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' - 'probabilities and not labels are predicted.') + _check_binary_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', self._dml_data.y_col) # nuisance m if m_external: diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 215c0a088..8d0e74070 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -966,7 +966,7 @@ def test_doubleml_exception_learner(): with pytest.warns(UserWarning, match=msg): dml_irm_hidden_classifier = DoubleMLIRM(dml_data_irm_binary_outcome, log_reg, LogisticRegression()) - msg = (r'For the binary outcome variable y, predictions obtained with the ml_g learner ' + msg = (r'For the binary variable y, predictions obtained with the ml_g learner ' r'LogisticRegressionManipulatedPredict\(\) are also observed to be binary with values 0 and 1. Make sure ' 'that for classifiers probabilities and not labels are predicted.') with pytest.raises(ValueError, match=msg): diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index e54c4041b..d7d2881ed 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -206,6 +206,16 @@ def _check_is_propensity(preds, learner, learner_name, smpls, eps=1e-12): return +def _check_binary_predictions(pred, learner, learner_name, variable_name): + binary_preds = (type_of_target(pred) == 'binary') + zero_one_preds = np.all((np.power(pred, 2) - pred) == 0) + if binary_preds & zero_one_preds: + raise ValueError(f'For the binary variable {variable_name}, ' + f'predictions obtained with the {learner_name} learner {str(learner)} are also ' + 'observed to be binary with values 0 and 1. Make sure that for classifiers ' + 'probabilities and not labels are predicted.') + + def _check_benchmarks(benchmarks): if benchmarks is not None: if not isinstance(benchmarks, dict): From b9d8ff61811f0b751b4d9753e40206696b0e200d Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 15:25:58 +0200 Subject: [PATCH 65/98] update binary outcome check iivm --- doubleml/irm/iivm.py | 22 +++++----------------- doubleml/tests/test_exceptions.py | 2 +- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index c131449bb..c2f85dd4d 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -7,7 +7,8 @@ from ..double_ml_score_mixins import LinearScoreMixin from ..utils._estimation import _dml_cv_predict, _get_cond_smpls, _dml_tune, _trimm, _normalize_ipw -from ..utils._checks import _check_score, _check_trimming, _check_finite_predictions, _check_is_propensity +from ..utils._checks import _check_score, _check_trimming, _check_finite_predictions, _check_is_propensity, \ + _check_binary_predictions class DoubleMLIIVM(LinearScoreMixin, DoubleML): @@ -264,15 +265,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat0['targets'][z == 1] = np.nan if self._dml_data.binary_outcome: - binary_preds = (type_of_target(g_hat0['preds']) == 'binary') - zero_one_preds = np.all((np.power(g_hat0['preds'], 2) - g_hat0['preds']) == 0) - if binary_preds & zero_one_preds: - raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, ' - f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also ' - 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' - 'probabilities and not labels are predicted.') - + _check_binary_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', self._dml_data.y_col) _check_is_propensity(g_hat0['preds'], self._learner['ml_g'], 'ml_g', smpls, eps=1e-12) + if external_predictions['ml_g1'] is not None: g_hat1 = {'preds': external_predictions['ml_g1'], 'targets': None, @@ -287,14 +282,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat1['targets'][z == 0] = np.nan if self._dml_data.binary_outcome: - binary_preds = (type_of_target(g_hat1['preds']) == 'binary') - zero_one_preds = np.all((np.power(g_hat1['preds'], 2) - g_hat1['preds']) == 0) - if binary_preds & zero_one_preds: - raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, ' - f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also ' - 'observed to be binary with values 0 and 1. Make sure that for classifiers ' - 'probabilities and not labels are predicted.') - + _check_binary_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', self._dml_data.y_col) _check_is_propensity(g_hat1['preds'], self._learner['ml_g'], 'ml_g', smpls, eps=1e-12) # nuisance m diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 8d0e74070..1dc23dfb5 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -980,7 +980,7 @@ def test_doubleml_exception_learner(): with pytest.warns(UserWarning, match=msg): dml_iivm_hidden_classifier = DoubleMLIIVM(dml_data_iivm_binary_outcome, log_reg, LogisticRegression(), LogisticRegression()) - msg = (r'For the binary outcome variable y, predictions obtained with the ml_g learner ' + msg = (r'For the binary variable y, predictions obtained with the ml_g learner ' r'LogisticRegressionManipulatedPredict\(\) are also observed to be binary with values 0 and 1. Make sure ' 'that for classifiers probabilities and not labels are predicted.') with pytest.raises(ValueError, match=msg): From ffa77cf0bc5702d51fc0eaf0f62a6f17c1d0f929 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 15:28:34 +0200 Subject: [PATCH 66/98] update binary treatment check plr --- doubleml/plm/plr.py | 10 ++-------- doubleml/tests/test_exceptions.py | 2 +- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index 3b7d90f5b..fd9d78ae9 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -12,7 +12,7 @@ from ..utils.blp import DoubleMLBLP from ..utils._estimation import _dml_cv_predict, _dml_tune -from ..utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from ..utils._checks import _check_score, _check_finite_predictions, _check_is_propensity, _check_binary_predictions class DoubleMLPLR(LinearScoreMixin, DoubleML): @@ -198,13 +198,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa _check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12) if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: - binary_preds = (type_of_target(m_hat['preds']) == 'binary') - zero_one_preds = np.all((np.power(m_hat['preds'], 2) - m_hat['preds']) == 0) - if binary_preds & zero_one_preds: - raise ValueError(f'For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, ' - f'predictions obtained with the ml_m learner {str(self._learner["ml_m"])} are also ' - 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' - 'probabilities and not labels are predicted.') + _check_binary_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', self._dml_data.d_cols[self._i_treat]) # an estimate of g is obtained for the IV-type score and callable scores g_hat = {'preds': None, 'targets': None, 'models': None} diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 1dc23dfb5..3ba85e167 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -950,7 +950,7 @@ def test_doubleml_exception_learner(): 'nor a classifier. Method predict is used for prediction.') with pytest.warns(UserWarning, match=msg): dml_plr_hidden_classifier = DoubleMLPLR(dml_data_irm, Lasso(), log_reg) - msg = (r'For the binary treatment variable d, predictions obtained with the ml_m learner LogisticRegression\(\) ' + msg = (r'For the binary variable d, predictions obtained with the ml_m learner LogisticRegression\(\) ' 'are also observed to be binary with values 0 and 1. Make sure that for classifiers probabilities and not ' 'labels are predicted.') with pytest.raises(ValueError, match=msg): From d66658c7cace6992a2080b58107fca0edb469534 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 15:28:48 +0200 Subject: [PATCH 67/98] Update plr.py --- doubleml/plm/plr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index fd9d78ae9..d5810b972 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd from sklearn.utils import check_X_y -from sklearn.utils.multiclass import type_of_target from sklearn.base import clone import warnings From 50d8b2dee53bcb7c157153a4801f3d73b869e044 Mon Sep 17 00:00:00 2001 From: Sven1704 Date: Thu, 25 Jul 2024 15:30:28 +0200 Subject: [PATCH 68/98] update binary outcome check apo --- doubleml/irm/apo.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index 838e55460..13aae5c32 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -3,7 +3,6 @@ import warnings from sklearn.utils import check_X_y -from sklearn.utils.multiclass import type_of_target from ..double_ml import DoubleML @@ -14,7 +13,7 @@ from ..utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls, _cond_targets, _trimm, \ _normalize_ipw from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_finite_predictions, \ - _check_is_propensity + _check_is_propensity, _check_binary_predictions class DoubleMLAPO(LinearScoreMixin, DoubleML): @@ -230,13 +229,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat0['targets'] = _cond_targets(g_hat0['targets'], cond_sample=(treated == 0)) if self._dml_data.binary_outcome: - binary_preds = (type_of_target(g_hat0['preds']) == 'binary') - zero_one_preds = np.all((np.power(g_hat0['preds'], 2) - g_hat0['preds']) == 0) - if binary_preds & zero_one_preds: - raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, ' - f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also ' - 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' - 'probabilities and not labels are predicted.') + _check_binary_predictions(g_hat0['preds'], self._learner['ml_g'], 'ml_g', self._dml_data.y_col) if g1_external: # use external predictions @@ -252,13 +245,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat1['targets'] = _cond_targets(g_hat1['targets'], cond_sample=(treated == 1)) if self._dml_data.binary_outcome: - binary_preds = (type_of_target(g_hat1['preds']) == 'binary') - zero_one_preds = np.all((np.power(g_hat1['preds'], 2) - g_hat1['preds']) == 0) - if binary_preds & zero_one_preds: - raise ValueError(f'For the binary outcome variable {self._dml_data.y_col}, ' - f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also ' - 'observed to be binary with values 0 and 1. Make sure that for classifiers ' - 'probabilities and not labels are predicted.') + _check_binary_predictions(g_hat1['preds'], self._learner['ml_g'], 'ml_g', self._dml_data.y_col) # nuisance m if m_external: From 0bfda41a4748ae81fd364c3ac0525c315b307da3 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:33:12 +0200 Subject: [PATCH 69/98] adjust check data for APO --- doubleml/irm/apo.py | 4 ---- doubleml/tests/_utils.py | 11 +++++++++++ doubleml/tests/test_exceptions.py | 13 ++----------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index 13aae5c32..f52750709 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -8,7 +8,6 @@ from ..utils.blp import DoubleMLBLP from ..double_ml_score_mixins import LinearScoreMixin -from ..double_ml_data import DoubleMLData from ..utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls, _cond_targets, _trimm, \ _normalize_ipw @@ -368,9 +367,6 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ return res def _check_data(self, obj_dml_data): - if not isinstance(obj_dml_data, DoubleMLData): - raise TypeError('The data must be of DoubleMLData type. ' - f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') if obj_dml_data.z_cols is not None: raise ValueError('Incompatible data. 
' + ' and '.join(obj_dml_data.z_cols) + diff --git a/doubleml/tests/_utils.py b/doubleml/tests/_utils.py index fb85b2410..18ceef883 100644 --- a/doubleml/tests/_utils.py +++ b/doubleml/tests/_utils.py @@ -5,6 +5,17 @@ from scipy.stats import norm from ..utils._estimation import _var_est, _aggregate_coefs_and_ses +from ..double_ml_data import DoubleMLBaseData + + +class DummyDataClass(DoubleMLBaseData): + def __init__(self, + data): + DoubleMLBaseData.__init__(self, data) + + @property + def n_coefs(self): + return 1 def draw_smpls(n_obs, n_folds, n_rep=1, groups=None): diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 3ba85e167..a694d807f 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -8,7 +8,8 @@ DoubleMLDIDCS, DoubleMLBLP from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data, \ make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020 -from doubleml.double_ml_data import DoubleMLBaseData + +from ._utils import DummyDataClass from sklearn.linear_model import Lasso, LogisticRegression from sklearn.base import BaseEstimator @@ -38,16 +39,6 @@ dml_data_iivm_binary_outcome = DoubleMLData.from_arrays(x, y, d, z) -class DummyDataClass(DoubleMLBaseData): - def __init__(self, - data): - DoubleMLBaseData.__init__(self, data) - - @property - def n_coefs(self): - return 1 - - @pytest.mark.ci def test_doubleml_exception_data(): msg = 'The data must be of DoubleMLData or DoubleMLClusterData type.' From 4189df586aa239290273e6e4f1ecb6b5377f65dc Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:56:09 +0200 Subject: [PATCH 70/98] add apo and gapo exception tests --- doubleml/irm/tests/test_apo_exceptions.py | 51 ++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index ce6d4fd79..970ba1900 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -3,9 +3,10 @@ import numpy as np from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatments, make_iivm_data +from doubleml.datasets import make_irm_data_discrete_treatments, make_iivm_data, make_irm_data from sklearn.linear_model import Lasso, LogisticRegression +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor n = 100 data_apo = make_irm_data_discrete_treatments(n_obs=n) @@ -142,3 +143,51 @@ def test_apo_exception_weights(): _ = DoubleMLAPO(dml_data, ml_g, ml_m, treatment_level=0, weights={'weights': np.ones((dml_data.d.shape[0], )), 'weights_bar': np.zeros((dml_data.d.shape[0], 1))}) + + +@pytest.mark.ci +def test_apo_exception_capo_gapo(): + n = 20 + # collect data + np.random.seed(42) + obj_dml_data = make_irm_data(n_obs=n, dim_x=2) + + # First stage estimation + ml_g = RandomForestRegressor(n_estimators=10) + ml_m = RandomForestClassifier(n_estimators=10) + + dml_obj = DoubleMLAPO(obj_dml_data, + ml_m=ml_m, + ml_g=ml_g, + treatment_level=0) + + dml_obj.fit() + # create a random basis + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) + + msg = "Invalid score APO_2. Valid score APO." + with pytest.raises(ValueError, match=msg): + dml_obj._score = 'APO_2' + _ = dml_obj.capo(random_basis) + # reset the score + dml_obj._score = 'APO' + + msg = "Only implemented for one repetition. Number of repetitions is 2." 
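+    # capo() is only implemented for a single repetition, so forcing n_rep = 2 has to raise a NotImplementedError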
+    with pytest.raises(NotImplementedError, match=msg):
+        dml_obj._n_rep = 2
+        dml_obj.capo(random_basis)
+    # reset the number of repetitions
+    dml_obj._n_rep = 1
+
+    msg = "Groups must be of DataFrame type. Groups of type <class 'int'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = dml_obj.gapo(1)
+
+    groups_1 = pd.DataFrame(
+        np.column_stack([obj_dml_data.data['X1'] > 0.2, np.ones_like(obj_dml_data.data['X1'])]),
+        columns=['Group 1', 'Group 2']
+    )
+    msg = (r'Columns of groups must be of bool type or int type \(dummy coded\). Alternatively,'
+           ' groups should only contain one column.')
+    with pytest.raises(TypeError, match=msg):
+        _ = dml_obj.gapo(groups_1)

From 73a164f32be7aadb56b623666463d26bb5c27ee1 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 25 Jul 2024 18:08:35 +0200
Subject: [PATCH 71/98] add methods exception tests for apos

---
 doubleml/irm/tests/test_apos_exceptions.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 058db5f72..3e60bba83 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -82,6 +82,24 @@ def test_apos_exception_ipw_normalization():
         _ = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, normalize_ipw=1)
 
 
+@pytest.mark.ci
+def test_apos_exception_properties_and_methods():
+    # properties
+    dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, draw_sample_splitting=False)
+    msg = r'Sample splitting not specified. Draw samples via .draw_sample_splitting\(\). External samples not implemented yet.'
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.smpls
+
+    # methods
+    dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0)
+    msg = r'Apply fit\(\) before confint\(\).'
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.confint()
+    msg = r'Apply fit\(\) before bootstrap\(\).'
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.bootstrap()
+
+
 @pytest.mark.ci
 def test_causal_contrast_exceptions():
     msg = r"Apply fit\(\) before causal_contrast\(\)."

From 039901dcd978e65861574a307cb176af632ee4fe Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 25 Jul 2024 19:59:35 +0200
Subject: [PATCH 72/98] add property tests for DoubleMLAPOS

---
 doubleml/irm/apos.py            | 12 +++++++--
 doubleml/irm/tests/test_apos.py | 45 ++++++++++++++++++++++++++++++++-
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py
index 95c909dfa..c0fabe39b 100644
--- a/doubleml/irm/apos.py
+++ b/doubleml/irm/apos.py
@@ -229,14 +229,22 @@ def t_stat(self):
         """
         t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_treatment_levels``,)).
         """
-        return self.framework.t_stats
+        if self._framework is None:
+            t_stats = None
+        else:
+            t_stats = self.framework.t_stats
+        return t_stats
 
     @property
     def pval(self):
         """
         p-values for the causal parameter(s) (shape (``n_treatment_levels``,)).
""" - return self.framework.pvals + if self._framework is None: + pvals = None + else: + pvals = self.framework.pvals + return pvals @property def smpls(self): diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 4ba478a41..6f8da9b90 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -8,12 +8,55 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.datasets import make_irm_data_discrete_treatments, make_irm_data from ._utils_apos_manual import fit_apos, boot_apos from ...tests._utils import confint_manual +@pytest.mark.ci +def test_apo_properties(): + n = 20 + # collect data + np.random.seed(42) + obj_dml_data = make_irm_data(n_obs=n, dim_x=2) + + dml_obj = dml.DoubleMLAPOS(obj_dml_data, + ml_g=RandomForestRegressor(n_estimators=10), + ml_m=RandomForestClassifier(n_estimators=10), + treatment_levels=0) + + # check properties before fit + assert dml_obj.n_rep_boot is None + assert dml_obj.coef is None + assert dml_obj.all_coef is None + assert dml_obj.se is None + assert dml_obj.all_se is None + assert dml_obj.t_stat is None + assert dml_obj.pval is None + assert dml_obj.n_rep_boot is None + assert dml_obj.boot_t_stat is None + assert dml_obj.boot_method is None + + # check properties after fit + dml_obj.fit() + assert dml_obj.coef is not None + assert dml_obj.all_coef is not None + assert dml_obj.se is not None + assert dml_obj.all_se is not None + assert dml_obj.t_stat is not None + assert dml_obj.pval is not None + assert dml_obj.n_rep_boot is None + assert dml_obj.boot_t_stat is None + assert dml_obj.boot_method is None + + # check properties after bootstrap + dml_obj.bootstrap() + assert dml_obj.n_rep_boot is not None + assert dml_obj.boot_t_stat is not None + assert dml_obj.boot_method is not None + + @pytest.fixture(scope='module', params=[[LinearRegression(), LogisticRegression(solver='lbfgs', max_iter=250, random_state=42)], From 039901dcd978e65861574a307cb176af632ee4fe Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 26 Jul 2024 07:44:34 +0200 Subject: [PATCH 73/98] fix exception test apos --- doubleml/irm/tests/test_apos_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index 3e60bba83..e9d972f13 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -88,7 +88,7 @@ def test_apos_exception_properties_and_methods(): dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, draw_sample_splitting=False) msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\). External samples not implemented yet.' 
with pytest.raises(ValueError, match=msg): - dml_obj.smpls + _ = dml_obj.smpls # methods dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0) From 953cbef49a7b71382e55c60daaf831f4029b5d66 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:48:58 +0200 Subject: [PATCH 74/98] add exception for framework with sensitivity analysis --- doubleml/tests/test_exceptions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index a694d807f..b8b690f34 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -260,6 +260,14 @@ def test_doubleml_exception_data(): Lasso(), LogisticRegression()) +@pytest.mark.ci +def test_doubleml_exception_framework(): + msg = r'Apply fit\(\) before sensitivity_analysis\(\).' + with pytest.raises(ValueError, match=msg): + dml_obj = DoubleMLPLR(dml_data, ml_l, ml_m) + dml_obj.sensitivity_analysis() + + @pytest.mark.ci def test_doubleml_exception_scores(): # PLR From a367b1e91f0c9ac4b3522482ed5634cb22e06898 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:17:17 +0200 Subject: [PATCH 75/98] update default test for doubleml --- doubleml/tests/test_model_defaults.py | 63 ++++++++++++++++----------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py index 5da2fbcaf..4df50b062 100644 --- a/doubleml/tests/test_model_defaults.py +++ b/doubleml/tests/test_model_defaults.py @@ -28,38 +28,23 @@ dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression()) dml_ssm = DoubleMLSSM(dml_data_ssm, Lasso(), LogisticRegression(), LogisticRegression()) -dml_plr.fit() -dml_pliv.fit() -dml_irm.fit() -dml_iivm.fit() -dml_cvar.fit() -dml_did.fit() -dml_did_cs.fit() -dml_ssm.fit() - -dml_plr.bootstrap() -dml_pliv.bootstrap() -dml_irm.bootstrap() -dml_iivm.bootstrap() -dml_cvar.bootstrap() -dml_did.bootstrap() -dml_did_cs.bootstrap() -dml_ssm.bootstrap() - # nonlinear models dml_pq = DoubleMLPQ(dml_data_irm, ml_g=LogisticRegression(), ml_m=LogisticRegression()) dml_lpq = DoubleMLLPQ(dml_data_iivm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier()) dml_qte = DoubleMLQTE(dml_data_irm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier()) -dml_pq.fit() -dml_lpq.fit() -dml_qte.fit() -dml_pq.bootstrap() -dml_lpq.bootstrap() -dml_qte.bootstrap() +def _assert_is_none(dml_obj): + assert dml_obj.n_rep_boot is None + assert dml_obj.boot_method is None + assert dml_obj.framework is None + assert dml_obj.sensitivity_params is None + assert dml_obj.boot_t_stat is None + -policy_tree = dml_irm.policy_tree(features=dml_data_irm.data.drop(columns=["y", "d"])) +def _fit_bootstrap(dml_obj): + dml_obj.fit() + dml_obj.bootstrap() def _assert_resampling_default_settings(dml_obj): @@ -84,12 +69,16 @@ def _assert_resampling_default_settings(dml_obj): @pytest.mark.ci def test_plr_defaults(): + _assert_is_none(dml_plr) + _fit_bootstrap(dml_plr) _assert_resampling_default_settings(dml_plr) assert dml_plr.score == 'partialling out' @pytest.mark.ci def test_pliv_defaults(): + _assert_is_none(dml_pliv) + _fit_bootstrap(dml_pliv) _assert_resampling_default_settings(dml_pliv) assert dml_pliv.score == 'partialling out' assert dml_pliv.partialX @@ -98,6 +87,8 @@ def test_pliv_defaults(): @pytest.mark.ci def test_irm_defaults(): + _assert_is_none(dml_irm) 
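+    # pre-fit state is checked via _assert_is_none(); _fit_bootstrap() then fits and bootstraps before the defaults are asserted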
+ _fit_bootstrap(dml_irm) _assert_resampling_default_settings(dml_irm) assert dml_irm.score == 'ATE' assert dml_irm.trimming_rule == 'truncate' @@ -109,6 +100,8 @@ def test_irm_defaults(): @pytest.mark.ci def test_iivm_defaults(): + _assert_is_none(dml_iivm) + _fit_bootstrap(dml_iivm) _assert_resampling_default_settings(dml_iivm) assert dml_iivm.score == 'LATE' assert dml_iivm.subgroups == {'always_takers': True, 'never_takers': True} @@ -119,6 +112,8 @@ def test_iivm_defaults(): @pytest.mark.ci def test_cvar_defaults(): + _assert_is_none(dml_cvar) + _fit_bootstrap(dml_cvar) _assert_resampling_default_settings(dml_cvar) assert dml_cvar.quantile == 0.5 assert dml_cvar.treatment == 1 @@ -129,6 +124,8 @@ def test_cvar_defaults(): @pytest.mark.ci def test_pq_defaults(): + _assert_is_none(dml_pq) + _fit_bootstrap(dml_pq) _assert_resampling_default_settings(dml_pq) assert dml_pq.quantile == 0.5 assert dml_pq.treatment == 1 @@ -140,6 +137,8 @@ def test_pq_defaults(): @pytest.mark.ci def test_lpq_defaults(): + _assert_is_none(dml_lpq) + _fit_bootstrap(dml_lpq) _assert_resampling_default_settings(dml_lpq) assert dml_lpq.quantile == 0.5 assert dml_lpq.treatment == 1 @@ -151,6 +150,11 @@ def test_lpq_defaults(): @pytest.mark.ci def test_qte_defaults(): + assert dml_qte.n_rep_boot is None + assert dml_qte.boot_method is None + assert dml_qte.framework is None + assert dml_qte.boot_t_stat is None + _fit_bootstrap(dml_qte) # not fix since its a differen object added in future versions _assert_resampling_default_settings(dml_qte) assert dml_qte.quantiles == 0.5 assert dml_qte.score == 'PQ' @@ -161,6 +165,8 @@ def test_qte_defaults(): @pytest.mark.ci def test_did_defaults(): + _assert_is_none(dml_did) + _fit_bootstrap(dml_did) _assert_resampling_default_settings(dml_did) assert dml_did.score == 'observational' assert dml_did.in_sample_normalization @@ -170,6 +176,8 @@ def test_did_defaults(): @pytest.mark.ci def test_did_cs_defaults(): + _assert_is_none(dml_did_cs) + _fit_bootstrap(dml_did_cs) _assert_resampling_default_settings(dml_did_cs) assert dml_did.score == 'observational' assert dml_did_cs.in_sample_normalization @@ -179,6 +187,8 @@ def test_did_cs_defaults(): @pytest.mark.ci def test_ssm_defaults(): + _assert_is_none(dml_ssm) + _fit_bootstrap(dml_ssm) _assert_resampling_default_settings(dml_ssm) assert dml_ssm.score == 'missing-at-random' assert dml_ssm.trimming_rule == 'truncate' @@ -200,6 +210,9 @@ def test_sensitivity_defaults(): @pytest.mark.ci def test_policytree_defaults(): + dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) + dml_irm.fit() + policy_tree = dml_irm.policy_tree(features=dml_data_irm.data.drop(columns=["y", "d"])) assert policy_tree.policy_tree.max_depth == 2 assert policy_tree.policy_tree.min_samples_leaf == 8 assert policy_tree.policy_tree.ccp_alpha == 0.01 From 87f6acc9d13c95f07b3f986d442545a94b3c2574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaa=C3=9Fen?= <47529404+SvenKlaassen@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:08:53 +0200 Subject: [PATCH 76/98] extend model default tests for apo and apos --- doubleml/tests/test_model_defaults.py | 56 ++++++++++++++++++++------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py index 4df50b062..b28274e24 100644 --- a/doubleml/tests/test_model_defaults.py +++ b/doubleml/tests/test_model_defaults.py @@ -1,8 +1,7 @@ import pytest import numpy as np -from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, 
DoubleMLPLIV, DoubleMLCVAR, DoubleMLPQ, \ - DoubleMLLPQ, DoubleMLQTE, DoubleMLDID, DoubleMLDIDCS, DoubleMLSSM +import doubleml as dml from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data, make_did_SZ2020, \ make_ssm_data @@ -19,19 +18,21 @@ dml_data_ssm = make_ssm_data(n_obs=2000, mar=True) # linear models -dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso()) -dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso()) -dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) -dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression()) -dml_cvar = DoubleMLCVAR(dml_data_irm, ml_g=RandomForestRegressor(), ml_m=RandomForestClassifier()) -dml_did = DoubleMLDID(dml_data_did, Lasso(), LogisticRegression()) -dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression()) -dml_ssm = DoubleMLSSM(dml_data_ssm, Lasso(), LogisticRegression(), LogisticRegression()) +dml_plr = dml.DoubleMLPLR(dml_data_plr, Lasso(), Lasso()) +dml_pliv = dml.DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso()) +dml_irm = dml.DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) +dml_iivm = dml.DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression()) +dml_cvar = dml.DoubleMLCVAR(dml_data_irm, ml_g=RandomForestRegressor(), ml_m=RandomForestClassifier()) +dml_did = dml.DoubleMLDID(dml_data_did, Lasso(), LogisticRegression()) +dml_did_cs = dml.DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression()) +dml_ssm = dml.DoubleMLSSM(dml_data_ssm, Lasso(), LogisticRegression(), LogisticRegression()) +dml_apo = dml.DoubleMLAPO(dml_data_irm, Lasso(), LogisticRegression(), treatment_level=0) +dml_apos = dml.DoubleMLAPOS(dml_data_irm, Lasso(), LogisticRegression(), treatment_levels=[0, 1]) # nonlinear models -dml_pq = DoubleMLPQ(dml_data_irm, ml_g=LogisticRegression(), ml_m=LogisticRegression()) -dml_lpq = DoubleMLLPQ(dml_data_iivm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier()) -dml_qte = DoubleMLQTE(dml_data_irm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier()) +dml_pq = dml.DoubleMLPQ(dml_data_irm, ml_g=LogisticRegression(), ml_m=LogisticRegression()) +dml_lpq = dml.DoubleMLLPQ(dml_data_iivm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier()) +dml_qte = dml.DoubleMLQTE(dml_data_irm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier()) def _assert_is_none(dml_obj): @@ -196,6 +197,33 @@ def test_ssm_defaults(): assert not dml_ssm.normalize_ipw +@pytest.mark.ci +def test_apo_defaults(): + _assert_is_none(dml_apo) + _fit_bootstrap(dml_apo) + _assert_resampling_default_settings(dml_apo) + assert dml_apo.score == 'APO' + assert dml_apo.trimming_rule == 'truncate' + assert dml_apo.trimming_threshold == 1e-2 + assert not dml_apo.normalize_ipw + assert set(dml_apo.weights.keys()) == set(['weights']) + assert np.array_equal(dml_apo.weights['weights'], np.ones((dml_apo._dml_data.n_obs,))) + + +@pytest.mark.ci +def test_apos_defaults(): + assert dml_apos.n_rep_boot is None + assert dml_apo.boot_method is None + assert dml_apo.framework is None + assert dml_apo.boot_t_stat is None + _fit_bootstrap(dml_qte) + assert dml_apos.score == 'APO' + assert dml_apos.trimming_rule == 'truncate' + assert dml_apos.trimming_threshold == 1e-2 + assert not dml_apos.normalize_ipw + assert np.array_equal(dml_apos.weights, np.ones((dml_apos._dml_data.n_obs,))) + + @pytest.mark.ci def test_sensitivity_defaults(): input_dict = {'cf_y': 0.03, @@ -210,7 +238,7 @@ def 
test_sensitivity_defaults(): @pytest.mark.ci def test_policytree_defaults(): - dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) + dml_irm = dml.DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) dml_irm.fit() policy_tree = dml_irm.policy_tree(features=dml_data_irm.data.drop(columns=["y", "d"])) assert policy_tree.policy_tree.max_depth == 2 From 1628001f4ee4b0fbda3ce517c2724b2499d26bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Klaa=C3=9Fen?= <47529404+SvenKlaassen@users.noreply.github.com> Date: Sun, 28 Jul 2024 12:50:37 +0200 Subject: [PATCH 77/98] fix model default test --- doubleml/tests/test_model_defaults.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py index b28274e24..8d7234d62 100644 --- a/doubleml/tests/test_model_defaults.py +++ b/doubleml/tests/test_model_defaults.py @@ -213,9 +213,9 @@ def test_apo_defaults(): @pytest.mark.ci def test_apos_defaults(): assert dml_apos.n_rep_boot is None - assert dml_apo.boot_method is None - assert dml_apo.framework is None - assert dml_apo.boot_t_stat is None + assert dml_apos.boot_method is None + assert dml_apos.framework is None + assert dml_apos.boot_t_stat is None _fit_bootstrap(dml_qte) assert dml_apos.score == 'APO' assert dml_apos.trimming_rule == 'truncate' From f24d13a46e2720e87216883b0485baf40e5988b4 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 06:49:19 +0200 Subject: [PATCH 78/98] add sensitivity_elements property to apos --- doubleml/irm/apos.py | 13 +++++++++++++ doubleml/irm/tests/test_apos.py | 2 ++ 2 files changed, 15 insertions(+) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index c0fabe39b..9104ef564 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -283,6 +283,19 @@ def modellist(self): """ return self._modellist + @property + def sensitivity_elements(self): + """ + Values of the sensitivity components after calling :meth:`fit`; + If available (e.g., PLR, IRM) a dictionary with entries ``sigma2``, ``nu2``, ``psi_sigma2``, ``psi_nu2`` + and ``riesz_rep``. 
+ """ + if self._framework is None: + sensitivity_elements = None + else: + sensitivity_elements = self._framework.sensitivity_elements + return sensitivity_elements + @property def summary(self): """ diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 6f8da9b90..ce1fb48f5 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -37,6 +37,7 @@ def test_apo_properties(): assert dml_obj.n_rep_boot is None assert dml_obj.boot_t_stat is None assert dml_obj.boot_method is None + assert dml_obj.sensitivity_elements is None # check properties after fit dml_obj.fit() @@ -49,6 +50,7 @@ def test_apo_properties(): assert dml_obj.n_rep_boot is None assert dml_obj.boot_t_stat is None assert dml_obj.boot_method is None + assert dml_obj.sensitivity_elements is not None # check properties after bootstrap dml_obj.bootstrap() From ac38ba77d7216006113195b86ad3c93942e4e518 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 06:54:36 +0200 Subject: [PATCH 79/98] add sensitivity_params and sensitivity_analysis to apos --- doubleml/irm/apos.py | 62 ++++++++++++++++++++++ doubleml/irm/tests/test_apos.py | 6 +++ doubleml/irm/tests/test_apos_exceptions.py | 3 ++ 3 files changed, 71 insertions(+) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 9104ef564..d8283c1fb 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -296,6 +296,19 @@ def sensitivity_elements(self): sensitivity_elements = self._framework.sensitivity_elements return sensitivity_elements + @property + def sensitivity_params(self): + """ + Values of the sensitivity parameters after calling :meth:`sesitivity_analysis`; + If available (e.g., PLR, IRM) a dictionary with entries ``theta``, ``se``, ``ci``, ``rv`` + and ``rva``. + """ + if self._framework is None: + sensitivity_params = None + else: + sensitivity_params = self._framework.sensitivity_params + return sensitivity_params + @property def summary(self): """ @@ -413,6 +426,55 @@ def bootstrap(self, method='normal', n_rep_boot=500): return self + def sensitivity_analysis(self, cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95, null_hypothesis=0.0): + """ + Performs a sensitivity analysis to account for unobserved confounders. + + The evaluated scenario is stored as a dictionary in the property ``sensitivity_params``. + + Parameters + ---------- + cf_y : float + Percentage of the residual variation of the outcome explained by latent/confounding variables. + Default is ``0.03``. + + cf_d : float + Percentage gains in the variation of the Riesz representer generated by latent/confounding variables. + Default is ``0.03``. + + rho : float + The correlation between the differences in short and long representations in the main regression and + Riesz representer. Has to be in [-1,1]. The absolute value determines the adversarial strength of the + confounding (maximizes at 1.0). + Default is ``1.0``. + + level : float + The confidence level. + Default is ``0.95``. + + null_hypothesis : float or numpy.ndarray + Null hypothesis for the effect. Determines the robustness values. + If it is a single float uses the same null hypothesis for all estimated parameters. + Else the array has to be of shape (n_coefs,). + Default is ``0.0``. 
+
+        Returns
+        -------
+        self : object
+        """
+
+        if self._framework is None:
+            raise ValueError('Apply fit() before sensitivity_analysis().')
+        self._framework.sensitivity_analysis(
+            cf_y=cf_y,
+            cf_d=cf_d,
+            rho=rho,
+            level=level,
+            null_hypothesis=null_hypothesis
+        )
+
+        return self
+
     def draw_sample_splitting(self):
         """
         Draw sample splitting for DoubleML models.
diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py
index ce1fb48f5..9ebc7591e 100644
--- a/doubleml/irm/tests/test_apos.py
+++ b/doubleml/irm/tests/test_apos.py
@@ -38,6 +38,7 @@ def test_apo_properties():
     assert dml_obj.boot_t_stat is None
     assert dml_obj.boot_method is None
     assert dml_obj.sensitivity_elements is None
+    assert dml_obj.sensitivity_params is None
 
     # check properties after fit
     dml_obj.fit()
@@ -51,6 +52,7 @@ def test_apo_properties():
     assert dml_obj.boot_t_stat is None
     assert dml_obj.boot_method is None
     assert dml_obj.sensitivity_elements is not None
+    assert dml_obj.sensitivity_params is None
 
     # check properties after bootstrap
     dml_obj.bootstrap()
@@ -58,6 +60,10 @@ def test_apo_properties():
     assert dml_obj.boot_t_stat is not None
     assert dml_obj.boot_method is not None
 
+    # check properties after sensitivity analysis
+    dml_obj.sensitivity_analysis()
+    assert dml_obj.sensitivity_params is not None
+
 
 @pytest.fixture(scope='module',
                 params=[[LinearRegression(),
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index e9d972f13..c6bee072c 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -98,6 +98,9 @@ def test_apos_exception_properties_and_methods():
     msg = r'Apply fit\(\) before bootstrap\(\).'
     with pytest.raises(ValueError, match=msg):
         dml_obj.bootstrap()
+    msg = r'Apply fit\(\) before sensitivity_analysis\(\).'
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.sensitivity_analysis()

From 8f323a59001f8281f7c0105345128388ab1f8b77 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Wed, 31 Jul 2024 07:05:22 +0200
Subject: [PATCH 80/98] add sensitivity_plot to apos

---
 doubleml/irm/apos.py                       | 71 ++++++++++++++++++++++
 doubleml/irm/tests/test_apos_exceptions.py |  3 +
 2 files changed, 74 insertions(+)

diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py
index d8283c1fb..e04f55924 100644
--- a/doubleml/irm/apos.py
+++ b/doubleml/irm/apos.py
@@ -475,6 +475,77 @@ def sensitivity_analysis(self, cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95, null_h
 
         return self
 
+    def sensitivity_plot(self, idx_treatment=0, value='theta', rho=1.0, level=0.95, null_hypothesis=0.0,
+                         include_scenario=True, benchmarks=None, fill=True, grid_bounds=(0.15, 0.15), grid_size=100):
+        """
+        Contour plot of the sensitivity with respect to latent/confounding variables.
+
+        Parameters
+        ----------
+        idx_treatment : int
+            Index of the treatment to perform the sensitivity analysis.
+            Default is ``0``.
+
+        value : str
+            Determines which contours to plot. Valid values are ``'theta'`` (refers to the bounds)
+            and ``'ci'`` (refers to the bounds including statistical uncertainty).
+            Default is ``'theta'``.
+
+        rho : float
+            The correlation between the differences in short and long representations in the main regression and
+            Riesz representer. Has to be in [-1,1]. The absolute value determines the adversarial strength of the
+            confounding (maximizes at 1.0).
+            Default is ``1.0``.
+
+        level : float
+            The confidence level.
+            Default is ``0.95``.
+ + null_hypothesis : float + Null hypothesis for the effect. Determines the direction of the contour lines. + + include_scenario : bool + Indicates whether to highlight the scenario from the call of :meth:`sensitivity_analysis`. + Default is ``True``. + + benchmarks : dict or None + Dictionary of benchmarks to be included in the plot. The keys are ``cf_y``, ``cf_d`` and ``name``. + Default is ``None``. + + fill : bool + Indicates whether to use a heatmap style or only contour lines. + Default is ``True``. + + grid_bounds : tuple + Determines the evaluation bounds of the grid for ``cf_d`` and ``cf_y``. Has to contain two floats in [0, 1). + Default is ``(0.15, 0.15)``. + + grid_size : int + Determines the number of evaluation points of the grid. + Default is ``100``. + + Returns + ------- + fig : object + Plotly figure of the sensitivity contours. + """ + if self._framework is None: + raise ValueError('Apply fit() before sensitivity_plot().') + fig = self._framework.sensitivity_plot( + idx_treatment=idx_treatment, + value=value, + rho=rho, + level=level, + null_hypothesis=null_hypothesis, + include_scenario=include_scenario, + benchmarks=benchmarks, + fill=fill, + grid_bounds=grid_bounds, + grid_size=grid_size + ) + + return fig + def draw_sample_splitting(self): """ Draw sample splitting for DoubleML models. diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index c6bee072c..9d17447b7 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -101,6 +101,9 @@ def test_apos_exception_properties_and_methods(): msg = r'Apply fit\(\) before sensitivity_analysis\(\).' with pytest.raises(ValueError, match=msg): dml_obj.sensitivity_analysis() + msg = r'Apply fit\(\) before sensitivity_plot\(\).' 
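+    # sensitivity_plot() also delegates to the framework, so it is only available after fit()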
+ with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_plot() @pytest.mark.ci From 706004c7ac42e810469c6d89ba8a2e19d24d6e3f Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 07:30:11 +0200 Subject: [PATCH 81/98] Update test_return_types.py --- doubleml/tests/test_return_types.py | 141 +++++++++++----------------- 1 file changed, 56 insertions(+), 85 deletions(-) diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py index d76f2d147..79b5fc933 100644 --- a/doubleml/tests/test_return_types.py +++ b/doubleml/tests/test_return_types.py @@ -18,6 +18,7 @@ DoubleMLPolicyTree, DoubleMLFramework, DoubleMLSSM, + DoubleMLAPO ) from doubleml.datasets import ( make_plr_CCDDHNR2018, @@ -61,6 +62,7 @@ dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression()) dml_did_cs_binary_outcome = DoubleMLDIDCS(dml_data_did_cs_binary_outcome, LogisticRegression(), LogisticRegression()) dml_ssm = DoubleMLSSM(dml_data_ssm, ml_g=Lasso(), ml_m=LogisticRegression(), ml_pi=LogisticRegression()) +dml_apo = DoubleMLAPO(dml_data_irm, Lasso(), LogisticRegression(), treatment_level=0) @pytest.mark.ci @@ -77,7 +79,8 @@ (dml_did_binary_outcome, DoubleMLDID), (dml_did_cs, DoubleMLDIDCS), (dml_did_cs_binary_outcome, DoubleMLDIDCS), - (dml_ssm, DoubleMLSSM)]) + (dml_ssm, DoubleMLSSM), + (dml_apo, DoubleMLAPO)]) def test_return_types(dml_obj, cls): # ToDo: A second test case with multiple treatment variables would be helpful assert isinstance(dml_obj.__str__(), str) @@ -168,11 +171,16 @@ def test_return_types(dml_obj, cls): ssm_obj.fit() ssm_obj.bootstrap(n_rep_boot=n_rep_boot) +apo_obj = DoubleMLAPO(dml_data_irm, Lasso(), LogisticRegression(), treatment_level=0, + n_rep=n_rep, n_folds=n_folds) +apo_obj.fit() +apo_obj.bootstrap(n_rep_boot=n_rep_boot) + @pytest.mark.ci @pytest.mark.parametrize('dml_obj', [plr_obj, pliv_obj, irm_obj, iivm_obj, cvar_obj, pq_obj, lpq_obj, - did_obj, did_cs_obj]) + did_obj, did_cs_obj, ssm_obj, apo_obj]) def test_property_types_and_shapes(dml_obj): # not checked: learner, learner_names, params, params_names, score # already checked: summary @@ -300,6 +308,10 @@ def test_stored_predictions(): assert ssm_obj.predictions['ml_m'].shape == (n_obs, n_rep, n_treat) assert ssm_obj.predictions['ml_pi'].shape == (n_obs, n_rep, n_treat) + assert apo_obj.predictions['ml_g0'].shape == (n_obs, n_rep, n_treat) + assert apo_obj.predictions['ml_g1'].shape == (n_obs, n_rep, n_treat) + assert apo_obj.predictions['ml_m'].shape == (n_obs, n_rep, n_treat) + @pytest.mark.ci def test_stored_nuisance_targets(): @@ -347,6 +359,10 @@ def test_stored_nuisance_targets(): assert ssm_obj.nuisance_targets['ml_m'].shape == (n_obs, n_rep, n_treat) assert ssm_obj.nuisance_targets['ml_pi'].shape == (n_obs, n_rep, n_treat) + assert apo_obj.nuisance_targets['ml_g0'].shape == (n_obs, n_rep, n_treat) + assert apo_obj.nuisance_targets['ml_g1'].shape == (n_obs, n_rep, n_treat) + assert apo_obj.nuisance_targets['ml_m'].shape == (n_obs, n_rep, n_treat) + @pytest.mark.ci def test_nuisance_loss(): @@ -394,100 +410,55 @@ def test_nuisance_loss(): assert ssm_obj.nuisance_loss['ml_m'].shape == (n_rep, n_treat) assert ssm_obj.nuisance_loss['ml_pi'].shape == (n_rep, n_treat) + assert apo_obj.nuisance_loss['ml_g0'].shape == (n_rep, n_treat) + assert apo_obj.nuisance_loss['ml_g1'].shape == (n_rep, n_treat) + assert apo_obj.nuisance_loss['ml_m'].shape == (n_rep, n_treat) -@pytest.mark.ci -def test_sensitivity(): - var_keys = 
['sigma2', 'nu2'] - score_keys = ['psi_sigma2', 'psi_nu2', 'riesz_rep'] - benchmarks = {'cf_y': [0.1, 0.2], 'cf_d': [0.15, 0.2], 'name': ["test1", "test2"]} +def _test_sensitivity_return_types(dml_obj, n_rep, n_treat, benchmarking_set): + assert isinstance(dml_obj.sensitivity_elements, dict) + for key in ['sigma2', 'nu2']: + assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) + assert dml_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat) + for key in ['psi_sigma2', 'psi_nu2', 'riesz_rep']: + assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) + assert dml_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat) - # PLR - assert isinstance(plr_obj.sensitivity_elements, dict) - for key in var_keys: - assert isinstance(plr_obj.sensitivity_elements[key], np.ndarray) - assert plr_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat) - for key in score_keys: - assert isinstance(plr_obj.sensitivity_elements[key], np.ndarray) - assert plr_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat) - - assert isinstance(plr_obj.sensitivity_summary, str) - plr_obj.sensitivity_analysis() - assert isinstance(plr_obj.sensitivity_summary, str) - assert isinstance(plr_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) - assert isinstance(plr_obj.sensitivity_plot(value='ci', benchmarks=benchmarks), plotly.graph_objs._figure.Figure) - assert isinstance(plr_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) - assert isinstance( - plr_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), - tuple) - plr_benchmark = plr_obj.sensitivity_benchmark(benchmarking_set=["X1"]) - assert isinstance(plr_benchmark, pd.DataFrame) + assert isinstance(dml_obj.sensitivity_summary, str) + dml_obj.sensitivity_analysis() + assert isinstance(dml_obj.sensitivity_summary, str) + assert isinstance(dml_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) + benchmarks = {'cf_y': [0.1, 0.2], 'cf_d': [0.15, 0.2], 'name': ["test1", "test2"]} + assert isinstance(dml_obj.sensitivity_plot(value='ci', benchmarks=benchmarks), plotly.graph_objs._figure.Figure) - # DID - assert isinstance(irm_obj.sensitivity_elements, dict) - for key in var_keys: - assert isinstance(irm_obj.sensitivity_elements[key], np.ndarray) - assert irm_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat) - for key in score_keys: - assert isinstance(irm_obj.sensitivity_elements[key], np.ndarray) - assert irm_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat) - - assert isinstance(irm_obj.sensitivity_summary, str) - irm_obj.sensitivity_analysis() - assert isinstance(irm_obj.sensitivity_summary, str) - assert isinstance(irm_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) - assert isinstance(irm_obj.sensitivity_plot(value='ci', benchmarks=benchmarks), plotly.graph_objs._figure.Figure) - assert isinstance(irm_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) + assert isinstance(dml_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) assert isinstance( - irm_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), + dml_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), tuple ) - irm_benchmark = irm_obj.sensitivity_benchmark(benchmarking_set=["X1"]) - assert isinstance(irm_benchmark, pd.DataFrame) + benchmark = 
dml_obj.sensitivity_benchmark(benchmarking_set=benchmarking_set) + assert isinstance(benchmark, pd.DataFrame) + + return + + +@pytest.mark.ci +def test_sensitivity(): + + # PLR + _test_sensitivity_return_types(plr_obj, n_rep, n_treat, benchmarking_set=["X1"]) + + # IRM + _test_sensitivity_return_types(irm_obj, n_rep, n_treat, benchmarking_set=["X1"]) # DID - assert isinstance(did_obj.sensitivity_elements, dict) - for key in var_keys: - assert isinstance(did_obj.sensitivity_elements[key], np.ndarray) - assert did_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat) - for key in score_keys: - assert isinstance(did_obj.sensitivity_elements[key], np.ndarray) - assert did_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat) - - assert isinstance(did_obj.sensitivity_summary, str) - did_obj.sensitivity_analysis() - assert isinstance(did_obj.sensitivity_summary, str) - assert isinstance(did_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) - assert isinstance(did_obj.sensitivity_plot(value='ci', benchmarks=benchmarks), plotly.graph_objs._figure.Figure) - assert isinstance(did_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) - assert isinstance( - did_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), - tuple - ) - did_benchmark = did_obj.sensitivity_benchmark(benchmarking_set=['Z1']) - assert isinstance(did_benchmark, pd.DataFrame) + _test_sensitivity_return_types(did_obj, n_rep, n_treat, benchmarking_set=["Z1"]) # DIDCS - assert isinstance(did_cs_obj.sensitivity_elements, dict) - for key in var_keys: - assert isinstance(did_cs_obj.sensitivity_elements[key], np.ndarray) - assert did_cs_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat) - for key in score_keys: - assert isinstance(did_cs_obj.sensitivity_elements[key], np.ndarray) - assert did_cs_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat) - - assert isinstance(did_cs_obj.sensitivity_summary, str) - did_cs_obj.sensitivity_analysis() - assert isinstance(did_cs_obj.sensitivity_summary, str) - assert isinstance(did_cs_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) - assert isinstance(did_cs_obj.sensitivity_plot(value='ci', benchmarks=benchmarks), plotly.graph_objs._figure.Figure) - assert isinstance(did_cs_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) - assert isinstance( - did_cs_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), - tuple - ) - did_cs_benchmark = did_cs_obj.sensitivity_benchmark(benchmarking_set=['Z1']) - assert isinstance(did_cs_benchmark, pd.DataFrame) + _test_sensitivity_return_types(did_cs_obj, n_rep, n_treat, benchmarking_set=["Z1"]) + + # APO + _test_sensitivity_return_types(apo_obj, n_rep, n_treat, benchmarking_set=["X1"]) @pytest.mark.ci From fb2dfa63f177d0cf78a6f7fd50a5cf85569ba06b Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 07:32:57 +0200 Subject: [PATCH 82/98] fix format --- doubleml/tests/test_return_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py index 79b5fc933..a9014d089 100644 --- a/doubleml/tests/test_return_types.py +++ b/doubleml/tests/test_return_types.py @@ -172,7 +172,7 @@ def test_return_types(dml_obj, cls): ssm_obj.bootstrap(n_rep_boot=n_rep_boot) apo_obj = DoubleMLAPO(dml_data_irm, Lasso(), 
LogisticRegression(), treatment_level=0, - n_rep=n_rep, n_folds=n_folds) + n_rep=n_rep, n_folds=n_folds) apo_obj.fit() apo_obj.bootstrap(n_rep_boot=n_rep_boot) From f13ba784ea034713dad05c86fdd9cdc4e711b33c Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 08:01:24 +0200 Subject: [PATCH 83/98] add sensitivity_summary to framework obj --- doubleml/double_ml_framework.py | 49 ++++++++++++++++++++ doubleml/tests/test_framework_sensitivity.py | 23 +++++++++ 2 files changed, 72 insertions(+) diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py index 633d6464e..d6c54042e 100644 --- a/doubleml/double_ml_framework.py +++ b/doubleml/double_ml_framework.py @@ -225,6 +225,55 @@ def summary(self): self.pvals, ci, self._treatment_names) return df_summary + @property + def sensitivity_summary(self): + """ + Returns a summary for the sensitivity analysis after calling :meth:`sensitivity_analysis`. + + Returns + ------- + res : str + Summary for the sensitivity analysis. + """ + header = '================== Sensitivity Analysis ==================\n' + if self.sensitivity_params is None: + res = header + 'Apply sensitivity_analysis() to generate sensitivity_summary.' + else: + sig_level = f'Significance Level: level={self.sensitivity_params["input"]["level"]}\n' + scenario_params = f'Sensitivity parameters: cf_y={self.sensitivity_params["input"]["cf_y"]}; ' \ + f'cf_d={self.sensitivity_params["input"]["cf_d"]}, ' \ + f'rho={self.sensitivity_params["input"]["rho"]}' + + theta_and_ci_col_names = ['CI lower', 'theta lower', ' theta', 'theta upper', 'CI upper'] + theta_and_ci = np.transpose(np.vstack((self.sensitivity_params['ci']['lower'], + self.sensitivity_params['theta']['lower'], + self.thetas, + self.sensitivity_params['theta']['upper'], + self.sensitivity_params['ci']['upper']))) + df_theta_and_ci = pd.DataFrame(theta_and_ci, + columns=theta_and_ci_col_names, + index=self.treatment_names) + theta_and_ci_summary = str(df_theta_and_ci) + + rvs_col_names = ['H_0', 'RV (%)', 'RVa (%)'] + rvs = np.transpose(np.vstack((self.sensitivity_params['rv'], + self.sensitivity_params['rva']))) * 100 + + df_rvs = pd.DataFrame(np.column_stack((self.sensitivity_params["input"]["null_hypothesis"], rvs)), + columns=rvs_col_names, + index=self.treatment_names) + rvs_summary = str(df_rvs) + + res = header + \ + '\n------------------ Scenario ------------------\n' + \ + sig_level + scenario_params + '\n' + \ + '\n------------------ Bounds with CI ------------------\n' + \ + theta_and_ci_summary + '\n' + \ + '\n------------------ Robustness Values ------------------\n' + \ + rvs_summary + + return res + def __add__(self, other): if isinstance(other, DoubleMLFramework): diff --git a/doubleml/tests/test_framework_sensitivity.py b/doubleml/tests/test_framework_sensitivity.py index 5b1be8bd5..044d89d22 100644 --- a/doubleml/tests/test_framework_sensitivity.py +++ b/doubleml/tests/test_framework_sensitivity.py @@ -43,6 +43,7 @@ def dml_framework_sensitivity_fixture(n_rep, generate_data_simple): 'dml_obj': dml_irm_obj, 'dml_obj_2': dml_irm_obj_2, 'dml_framework_obj': dml_framework_obj, + 'dml_framework_obj_2': dml_framework_obj_2, 'dml_framework_obj_add_obj': dml_framework_obj_add_obj, 'dml_framework_obj_sub_obj': dml_framework_obj_sub_obj, 'dml_framework_obj_mul_obj': dml_framework_obj_mul_obj, @@ -59,6 +60,7 @@ def test_dml_framework_sensitivity_shapes(dml_framework_sensitivity_fixture): n_obs = 
dml_framework_sensitivity_fixture['dml_framework_obj'].n_obs object_list = ['dml_framework_obj', + 'dml_framework_obj_2', 'dml_framework_obj_add_obj', 'dml_framework_obj_sub_obj', 'dml_framework_obj_mul_obj'] @@ -81,3 +83,24 @@ def test_dml_framework_sensitivity_shapes(dml_framework_sensitivity_fixture): for key in score_keys: assert dml_framework_sensitivity_fixture['dml_framework_obj_concat']._sensitivity_elements[key].shape == \ (n_obs, 2, n_rep) + + +@pytest.mark.ci +def test_dml_framework_sensitivity_summary(dml_framework_sensitivity_fixture): + # summary without sensitivity analysis + sensitivity_summary = dml_framework_sensitivity_fixture['dml_framework_obj_2'].sensitivity_summary + substring = 'Apply sensitivity_analysis() to generate sensitivity_summary.' + assert substring in sensitivity_summary + + # summary with sensitivity analysis + sensitivity_summary = dml_framework_sensitivity_fixture['dml_framework_obj'].sensitivity_summary + assert isinstance(sensitivity_summary, str) + substrings = [ + '\n------------------ Scenario ------------------\n', + '\n------------------ Bounds with CI ------------------\n', + '\n------------------ Robustness Values ------------------\n', + 'Significance Level: level=', + 'Sensitivity parameters: cf_y=' + ] + for substring in substrings: + assert substring in sensitivity_summary From e0715c8ceb2d8c63c20104146c920d8c8d831148 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 08:14:14 +0200 Subject: [PATCH 84/98] move sensitivity_summary to DoubleMLFramework class --- doubleml/double_ml.py | 41 +++------------------- doubleml/irm/apos.py | 16 +++++++++ doubleml/irm/tests/test_apos_exceptions.py | 3 ++ doubleml/tests/test_exceptions.py | 7 ++++ 4 files changed, 30 insertions(+), 37 deletions(-) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 4bb88dfba..71f8b4418 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1436,44 +1436,11 @@ def sensitivity_summary(self): res : str Summary for the sensitivity analysis. """ - header = '================== Sensitivity Analysis ==================\n' - if self.sensitivity_params is None: - res = header + 'Apply sensitivity_analysis() to generate sensitivity_summary.' 
+ if self._framework is None: + raise ValueError('Apply sensitivity_analysis() before sensitivity_summary.') else: - sig_level = f'Significance Level: level={self.sensitivity_params["input"]["level"]}\n' - scenario_params = f'Sensitivity parameters: cf_y={self.sensitivity_params["input"]["cf_y"]}; ' \ - f'cf_d={self.sensitivity_params["input"]["cf_d"]}, ' \ - f'rho={self.sensitivity_params["input"]["rho"]}' - - theta_and_ci_col_names = ['CI lower', 'theta lower', ' theta', 'theta upper', 'CI upper'] - theta_and_ci = np.transpose(np.vstack((self.sensitivity_params['ci']['lower'], - self.sensitivity_params['theta']['lower'], - self.coef, - self.sensitivity_params['theta']['upper'], - self.sensitivity_params['ci']['upper']))) - df_theta_and_ci = pd.DataFrame(theta_and_ci, - columns=theta_and_ci_col_names, - index=self._dml_data.d_cols) - theta_and_ci_summary = str(df_theta_and_ci) - - rvs_col_names = ['H_0', 'RV (%)', 'RVa (%)'] - rvs = np.transpose(np.vstack((self.sensitivity_params['rv'], - self.sensitivity_params['rva']))) * 100 - - df_rvs = pd.DataFrame(np.column_stack((self.sensitivity_params["input"]["null_hypothesis"], rvs)), - columns=rvs_col_names, - index=self._dml_data.d_cols) - rvs_summary = str(df_rvs) - - res = header + \ - '\n------------------ Scenario ------------------\n' + \ - sig_level + scenario_params + '\n' + \ - '\n------------------ Bounds with CI ------------------\n' + \ - theta_and_ci_summary + '\n' + \ - '\n------------------ Robustness Values ------------------\n' + \ - rvs_summary - - return res + sensitivity_summary = self._framework.sensitivity_summary + return sensitivity_summary def sensitivity_plot(self, idx_treatment=0, value='theta', rho=1.0, level=0.95, null_hypothesis=0.0, include_scenario=True, benchmarks=None, fill=True, grid_bounds=(0.15, 0.15), grid_size=100): diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index e04f55924..c44198bc9 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -323,6 +323,22 @@ def summary(self): self.pval, ci, self._treatment_levels) return df_summary + @property + def sensitivity_summary(self): + """ + Returns a summary for the sensitivity analysis after calling :meth:`sensitivity_analysis`. + + Returns + ------- + res : str + Summary for the sensitivity analysis. + """ + if self._framework is None: + raise ValueError('Apply sensitivity_analysis() before sensitivity_summary.') + else: + sensitivity_summary = self._framework.sensitivity_summary + return sensitivity_summary + def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions=None): """ Estimate DoubleMLAPOS models. diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index 9d17447b7..7b3c8bdb0 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -104,6 +104,9 @@ def test_apos_exception_properties_and_methods(): msg = r'Apply fit\(\) before sensitivity_plot\(\).' with pytest.raises(ValueError, match=msg): dml_obj.sensitivity_plot() + msg = r'Apply sensitivity_analysis\(\) before sensitivity_summary.' 
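+    # sensitivity_summary is a property, so the ValueError is raised already on attribute access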
+ with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_summary @pytest.mark.ci diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index b8b690f34..97b8dac41 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -1107,6 +1107,13 @@ def test_doubleml_sensitivity_inputs(): dml_irm.sensitivity_analysis() +def test_doubleml_sensitivity_summary(): + dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), trimming_threshold=0.1) + msg = r'Apply sensitivity_analysis\(\) before sensitivity_summary.' + with pytest.raises(ValueError, match=msg): + _ = dml_irm.sensitivity_summary() + + @pytest.mark.ci def test_doubleml_sensitivity_benchmark(): dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), trimming_threshold=0.1) From 04fae5ed22db07ae9dd5c427e98d796471a270e2 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:57:55 +0200 Subject: [PATCH 85/98] fix gain statistics for multiple treatments --- doubleml/utils/gain_statistics.py | 4 ++-- .../tests/test_exceptions_gain_statistics.py | 24 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doubleml/utils/gain_statistics.py b/doubleml/utils/gain_statistics.py index 3c50d084a..5a05e1b2a 100644 --- a/doubleml/utils/gain_statistics.py +++ b/doubleml/utils/gain_statistics.py @@ -56,7 +56,7 @@ def gain_statistics(dml_long, dml_short): if not isinstance(dml_short.all_coef, np.ndarray): raise TypeError("dml_short.all_coef does not contain the necessary coefficients. Expected numpy.ndarray.") - expected_shape = (dml_long.sensitivity_elements['sigma2'].shape[2], dml_long.sensitivity_elements['sigma2'].shape[1]) + expected_shape = (dml_long.sensitivity_elements['sigma2'].shape[1], dml_long.sensitivity_elements['sigma2'].shape[2]) if dml_long.all_coef.shape != expected_shape: raise ValueError("dml_long.all_coef does not contain the necessary coefficients. Expected shape: " + str(expected_shape)) @@ -83,7 +83,7 @@ def gain_statistics(dml_long, dml_short): cf_d_benchmark = np.median(all_cf_d_benchmark, axis=0) # change in estimates (slightly different to paper) - all_delta_theta = np.transpose(dml_short.all_coef - dml_long.all_coef) + all_delta_theta = dml_short.all_coef - dml_long.all_coef delta_theta = np.median(all_delta_theta, axis=0) # degree of adversity diff --git a/doubleml/utils/tests/test_exceptions_gain_statistics.py b/doubleml/utils/tests/test_exceptions_gain_statistics.py index 805a84ed0..c4f3d3af3 100644 --- a/doubleml/utils/tests/test_exceptions_gain_statistics.py +++ b/doubleml/utils/tests/test_exceptions_gain_statistics.py @@ -22,13 +22,13 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs, n_rep, n_coef)), 'nu2': np.random.normal(size=(n_obs, n_rep, n_coef)) }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) # incorrect types dml_incorrect = test_dml_class( sensitivity_elements=np.random.normal(size=(n_obs, n_rep, n_coef)), - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = r"dml_long does not contain the necessary sensitivity elements\. Expected dict for dml_long\.sensitivity_elements\." 
with pytest.raises(TypeError, match=msg): @@ -43,7 +43,7 @@ def test_doubleml_exception_data(): sensitivity_elements={ 'sigma2': np.random.normal(size=(n_obs, n_rep, n_coef)), }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = r"dml_long does not contain the necessary sensitivity elements\. Required keys are: \['sigma2', 'nu2'\]" with pytest.raises(ValueError, match=msg): @@ -58,7 +58,7 @@ def test_doubleml_exception_data(): 'sigma2': {}, 'nu2': np.random.normal(size=(n_obs, n_rep, n_coef)) }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = r"dml_long does not contain the necessary sensitivity elements\. Expected numpy\.ndarray for key sigma2\." with pytest.raises(TypeError, match=msg): @@ -72,7 +72,7 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs, n_rep, n_coef)), 'nu2': {} }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = r"dml_long does not contain the necessary sensitivity elements\. Expected numpy\.ndarray for key nu2\." with pytest.raises(TypeError, match=msg): @@ -87,7 +87,7 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs + 1, n_rep, n_coef)), 'nu2': np.random.normal(size=(n_obs, n_rep, n_coef)) }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = (r"dml_long does not contain the necessary sensitivity elements\. " r"Expected 3 dimensions of shape \(1, n_coef, n_rep\) for key sigma2\.") @@ -103,7 +103,7 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs, n_rep, n_coef)), 'nu2': np.random.normal(size=(n_obs + 1, n_rep, n_coef)) }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = (r"dml_long does not contain the necessary sensitivity elements\. " r"Expected 3 dimensions of shape \(1, n_coef, n_rep\) for key nu2\.") @@ -120,7 +120,7 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs, n_rep + 1, n_coef)), 'nu2': np.random.normal(size=(n_obs, n_rep, n_coef)) }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = r"dml_long and dml_short do not contain the same shape of sensitivity elements\. " msg += r"Shapes of sigma2 are: \(1, 4, 5\) and \(1, 3, 5\)" @@ -136,7 +136,7 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs, n_rep, n_coef)), 'nu2': np.random.normal(size=(n_obs, n_rep + 1, n_coef)) }, - all_coef=np.random.normal(size=(n_coef, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef)) ) msg = r"dml_long and dml_short do not contain the same shape of sensitivity elements\. " msg += r"Shapes of nu2 are: \(1, 4, 5\) and \(1, 3, 5\)" @@ -168,11 +168,11 @@ def test_doubleml_exception_data(): 'sigma2': np.random.normal(size=(n_obs, n_rep, n_coef)), 'nu2': np.random.normal(size=(n_obs, n_rep, n_coef)) }, - all_coef=np.random.normal(size=(n_coef + 1, n_rep)) + all_coef=np.random.normal(size=(n_rep, n_coef + 1)) ) - msg = r"dml_long\.all_coef does not contain the necessary coefficients\. Expected shape: \(5, 3\)" + msg = r"dml_long\.all_coef does not contain the necessary coefficients\. Expected shape: \(3, 5\)" with pytest.raises(ValueError, match=msg): _ = gain_statistics(dml_incorrect, dml_correct) - msg = r"dml_short\.all_coef does not contain the necessary coefficients\. 
Expected shape: \(5, 3\)" + msg = r"dml_short\.all_coef does not contain the necessary coefficients\. Expected shape: \(3, 5\)" with pytest.raises(ValueError, match=msg): _ = gain_statistics(dml_correct, dml_incorrect) From 53b4e4acc00280aa1faca16002418c1ea9b2ab11 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:44:01 +0200 Subject: [PATCH 86/98] Update gain_statistics.py --- doubleml/utils/gain_statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doubleml/utils/gain_statistics.py b/doubleml/utils/gain_statistics.py index 5a05e1b2a..bfd388455 100644 --- a/doubleml/utils/gain_statistics.py +++ b/doubleml/utils/gain_statistics.py @@ -79,12 +79,12 @@ def gain_statistics(dml_long, dml_short): # Gain statistics all_cf_y_benchmark = np.clip(np.divide((R2_y_long - R2_y_short), (1.0 - R2_y_long)), 0, 1) all_cf_d_benchmark = np.clip(np.divide((1.0 - R2_riesz), R2_riesz), 0, 1) - cf_y_benchmark = np.median(all_cf_y_benchmark, axis=0) - cf_d_benchmark = np.median(all_cf_d_benchmark, axis=0) + cf_y_benchmark = np.median(all_cf_y_benchmark, axis=1) + cf_d_benchmark = np.median(all_cf_d_benchmark, axis=1) # change in estimates (slightly different to paper) all_delta_theta = dml_short.all_coef - dml_long.all_coef - delta_theta = np.median(all_delta_theta, axis=0) + delta_theta = np.median(all_delta_theta, axis=1) # degree of adversity var_g = var_y_residuals_short - var_y_residuals_long @@ -97,7 +97,7 @@ def gain_statistics(dml_long, dml_short): where=denom != 0), 0.0, 1.0) all_rho_benchmark = np.multiply(rho_values, rho_sign) - rho_benchmark = np.median(all_rho_benchmark, axis=0) + rho_benchmark = np.median(all_rho_benchmark, axis=1) benchmark_dict = { "cf_y": cf_y_benchmark, "cf_d": cf_d_benchmark, From 9911b69b8b30e08638c2fa3c1510af5f5447d314 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:44:14 +0200 Subject: [PATCH 87/98] add benchmarking to apos --- doubleml/irm/apos.py | 41 ++++++++++++++++++++++++++ doubleml/irm/tests/test_apos.py | 51 +++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 19 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index c44198bc9..c6337100e 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import copy from collections.abc import Iterable from sklearn.base import clone @@ -14,6 +15,7 @@ from ..utils.resampling import DoubleMLResampling from ..utils._descriptive import generate_summary from ..utils._checks import _check_score, _check_trimming, _check_weights, _check_sample_splitting +from ..utils.gain_statistics import gain_statistics class DoubleMLAPOS: @@ -562,6 +564,45 @@ def sensitivity_plot(self, idx_treatment=0, value='theta', rho=1.0, level=0.95, return fig + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + x_list_long = self._dml_data.x_cols + + # input checks + if self.sensitivity_elements is None: + raise NotImplementedError(f'Sensitivity analysis not yet implemented for {self.__class__.__name__}.') + if not isinstance(benchmarking_set, list): + raise TypeError('benchmarking_set must be a list. 
' + f'{str(benchmarking_set)} of type {type(benchmarking_set)} was passed.') + if len(benchmarking_set) == 0: + raise ValueError('benchmarking_set must not be empty.') + if not set(benchmarking_set) <= set(x_list_long): + raise ValueError(f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. " + f'{str(benchmarking_set)} was passed.') + if fit_args is not None and not isinstance(fit_args, dict): + raise TypeError('fit_args must be a dict. ' + f'{str(fit_args)} of type {type(fit_args)} was passed.') + + # refit short form of the model + x_list_short = [x for x in x_list_long if x not in benchmarking_set] + dml_short = copy.deepcopy(self) + dml_short._dml_data.x_cols = x_list_short + if fit_args is not None: + dml_short.fit(**fit_args) + else: + dml_short.fit() + + benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short) + df_benchmark = pd.DataFrame(benchmark_dict, index=self.treatment_levels) + return df_benchmark + def draw_sample_splitting(self): """ Draw sample splitting for DoubleML models. diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 9ebc7591e..92a372ff1 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -75,7 +75,7 @@ def learner(request): @pytest.fixture(scope='module', - params=[1]) + params=[1, 5]) def n_rep(request): return request.param @@ -148,6 +148,7 @@ def dml_apos_fixture(learner, n_rep, normalize_ipw, trimming_threshold, treatmen clone(learner[0]), clone(learner[1]), treatment_levels=treatment_levels, all_smpls=all_smpls, + n_rep=n_rep, score='APO', trimming_rule='truncate', normalize_ipw=normalize_ipw, @@ -176,23 +177,24 @@ def dml_apos_fixture(learner, n_rep, normalize_ipw, trimming_threshold, treatmen 'apos_model': dml_obj, 'unfitted_apos_model': unfitted_apos_model } + if n_rep == 1: + for bootstrap in boot_methods: + np.random.seed(42) + boot_t_stat = boot_apos(res_manual['apo_scaled_score'], res_manual['all_se'], treatment_levels, + all_smpls, n_rep, bootstrap, n_rep_boot) - for bootstrap in boot_methods: - np.random.seed(42) - boot_t_stat = boot_apos(res_manual['apo_scaled_score'], res_manual['all_se'], treatment_levels, - all_smpls, n_rep, bootstrap, n_rep_boot) + np.random.seed(42) + dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - np.random.seed(42) - dml_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat_' + bootstrap] = dml_obj.boot_t_stat + res_dict['boot_t_stat_' + bootstrap + '_manual'] = boot_t_stat - res_dict['boot_t_stat_' + bootstrap] = dml_obj.boot_t_stat - res_dict['boot_t_stat_' + bootstrap + '_manual'] = boot_t_stat - - ci = dml_obj.confint(joint=True, level=0.95) - ci_manual = confint_manual(res_manual['apos'], res_manual['se'], treatment_levels, - boot_t_stat=boot_t_stat, joint=True, level=0.95) - res_dict['boot_ci_' + bootstrap] = ci.to_numpy() - res_dict['boot_ci_' + bootstrap + '_manual'] = ci_manual.to_numpy() + ci = dml_obj.confint(joint=True, level=0.95) + ci_manual = confint_manual( + res_manual['apos'], res_manual['se'], treatment_levels, + boot_t_stat=boot_t_stat, joint=True, level=0.95) + res_dict['boot_ci_' + bootstrap] = ci.to_numpy() + res_dict['boot_ci_' + bootstrap + '_manual'] = ci_manual.to_numpy() # causal contrasts if len(treatment_levels) > 1: @@ -216,6 +218,8 @@ def test_dml_apos_coef(dml_apos_fixture): @pytest.mark.ci def test_dml_apos_se(dml_apos_fixture): + if dml_apos_fixture['n_rep'] != 1: + pytest.skip("Skipping test as n_rep is not 1") assert 
np.allclose(dml_apos_fixture['se'], dml_apos_fixture['se_manual'], rtol=1e-9, atol=1e-9) @@ -226,6 +230,8 @@ def test_dml_apos_se(dml_apos_fixture): @pytest.mark.ci def test_dml_apos_boot(dml_apos_fixture): + if dml_apos_fixture['n_rep'] != 1: + pytest.skip("Skipping test as n_rep is not 1") for bootstrap in dml_apos_fixture['boot_methods']: assert np.allclose(dml_apos_fixture['boot_t_stat_' + bootstrap], dml_apos_fixture['boot_t_stat_' + bootstrap + '_manual'], @@ -234,6 +240,8 @@ def test_dml_apos_boot(dml_apos_fixture): @pytest.mark.ci def test_dml_apos_ci(dml_apos_fixture): + if dml_apos_fixture['n_rep'] != 1: + pytest.skip("Skipping test as n_rep is not 1") for bootstrap in dml_apos_fixture['boot_methods']: assert np.allclose(dml_apos_fixture['ci'], dml_apos_fixture['ci_manual'], @@ -260,18 +268,23 @@ def test_doubleml_apos_return_types(dml_apos_fixture): assert isinstance(dml_apos_fixture['causal_contrast_single'], dml.DoubleMLFramework) assert isinstance(dml_apos_fixture['causal_contrast_multiple'], dml.DoubleMLFramework) + benchmark = dml_apos_fixture['apos_model'].sensitivity_benchmark(benchmarking_set=['x1']) + assert isinstance(benchmark, pd.DataFrame) + @pytest.mark.ci def test_doubleml_apos_causal_contrast(dml_apos_fixture): if dml_apos_fixture['n_treatment_levels'] == 1: pytest.skip("Skipping test as n_treatment_levels is 1") - acc_single = dml_apos_fixture['coef'][1:] - dml_apos_fixture['coef'][0] - assert np.allclose(dml_apos_fixture['causal_contrast_single'].thetas, + acc_single = dml_apos_fixture['apos_model'].all_coef[1:, ] - dml_apos_fixture['apos_model'].all_coef[0, ] + assert np.allclose(dml_apos_fixture['causal_contrast_single'].all_thetas, acc_single, rtol=1e-9, atol=1e-9) - acc_multiple = np.append(acc_single, dml_apos_fixture['coef'][2] - dml_apos_fixture['coef'][1]) - assert np.allclose(dml_apos_fixture['causal_contrast_multiple'].thetas, + acc_multiple = np.append(acc_single, + dml_apos_fixture['apos_model'].all_coef[2:3, ] - dml_apos_fixture['apos_model'].all_coef[1:2, ], + axis=0) + assert np.allclose(dml_apos_fixture['causal_contrast_multiple'].all_thetas, acc_multiple, rtol=1e-9, atol=1e-9) From 31b386a948ebb9b7f738b0d952fd0d84db8d25b3 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:11:56 +0200 Subject: [PATCH 88/98] add _all_treatments to apos --- doubleml/irm/apos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index c6337100e..2727c4e22 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -39,6 +39,7 @@ def __init__(self, self._is_cluster_data = isinstance(obj_dml_data, DoubleMLClusterData) self._check_data(self._dml_data) + self._all_treatments = np.unique(self._dml_data.d) self._treatment_levels = self._check_treatment_levels(treatment_levels) self._n_treatment_levels = len(self._treatment_levels) @@ -739,7 +740,7 @@ def _check_treatment_levels(self, treatment_levels): treatment_level_list = [treatment_levels] else: treatment_level_list = [t_lvl for t_lvl in treatment_levels] - is_d_subset = set(treatment_level_list).issubset(set(np.unique(self._dml_data.d))) + is_d_subset = set(treatment_level_list).issubset(set(self._all_treatments)) if not is_d_subset: raise ValueError('Invalid reference_levels. 
reference_levels has to be an iterable subset or '
                             'a single element of the unique treatment levels in the data.')

From 738edf34282f7b489a3c9c53ed18931ddcb42b26 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Wed, 31 Jul 2024 13:13:00 +0200
Subject: [PATCH 89/98] add exception tests for external predictions

---
 doubleml/irm/apos.py                       | 70 +++++++++++++++++++---
 doubleml/irm/tests/test_apos_exceptions.py | 44 ++++++++++++++
 2 files changed, 105 insertions(+), 9 deletions(-)

diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py
index 2727c4e22..27f4d5c12 100644
--- a/doubleml/irm/apos.py
+++ b/doubleml/irm/apos.py
@@ -39,9 +39,12 @@ def __init__(self,
         self._is_cluster_data = isinstance(obj_dml_data, DoubleMLClusterData)
         self._check_data(self._dml_data)
 
-        self._all_treatments = np.unique(self._dml_data.d)
+        self._all_treatment_levels = np.unique(self._dml_data.d)
+
         self._treatment_levels = self._check_treatment_levels(treatment_levels)
         self._n_treatment_levels = len(self._treatment_levels)
+        # Check if there are elements in self._all_treatment_levels that are not in self.treatment_levels
+        self._add_treatment_levels = [t for t in self._all_treatment_levels if t not in self._treatment_levels]
 
         self._normalize_ipw = normalize_ipw
         self._n_folds = n_folds
@@ -366,8 +369,14 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_
             to analyze the fitted models or extract information like variable importance.
             Default is ``False``.
 
-        external_predictions : None
-            Not implemented for DoubleMLAPOS.
+        external_predictions : dict or None
+            A nested dictionary where the keys correspond the the treatment levels and contain predictions according to each
+            treatment level. The values have to be dictionaries which containkeys ``'ml_g'`` and ``'ml_m'``.
+            The predictions for ``'ml_m'`` are passed directly to the DoubleMLAPO model,
+            whereas the predictions for ``'ml_g'`` are used to compute predictions for ``'ml_g1'`` and ``'ml_g0'``.
+            If the treatment levels do not cover all levels in the data, combined predictions for ``'ml_g'`` have
+            to be provided under the key ``'add_treatment_levels'``.
+            Default is `None`.
Returns ------- @@ -375,12 +384,20 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ """ if external_predictions is not None: - raise NotImplementedError(f"External predictions not implemented for {self.__class__.__name__}.") + self._check_external_predictions(external_predictions) + ext_pred_dict = self._recompute_external_predictions(self) # parallel estimation of the models parallel = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch='2*n_jobs') - fitted_models = parallel(delayed(self._fit_model)(i_level, n_jobs_cv, store_predictions, store_models) - for i_level in range(self.n_treatment_levels)) + fitted_models = parallel( + delayed(self._fit_model)( + i_level, + n_jobs_cv, + store_predictions, + store_models, + ext_pred_dict) + for i_level in range(self.n_treatment_levels) + ) # combine the estimates and scores framework_list = [None] * self.n_treatment_levels @@ -728,10 +745,15 @@ def causal_contrast(self, reference_levels): acc.treatment_names = all_treatment_names return acc - def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False): + def _fit_model(self, i_level, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions_dict=None): model = self.modellist[i_level] - model.fit(n_jobs_cv=n_jobs_cv, store_predictions=store_predictions, store_models=store_models) + if external_predictions_dict is not None: + external_predictions = external_predictions_dict[self.treatment_levels[i_level]] + else: + external_predictions = None + model.fit(n_jobs_cv=n_jobs_cv, store_predictions=store_predictions, store_models=store_models, + external_predictions=external_predictions) return model def _check_treatment_levels(self, treatment_levels): @@ -740,7 +762,7 @@ def _check_treatment_levels(self, treatment_levels): treatment_level_list = [treatment_levels] else: treatment_level_list = [t_lvl for t_lvl in treatment_levels] - is_d_subset = set(treatment_level_list).issubset(set(self._all_treatments)) + is_d_subset = set(treatment_level_list).issubset(set(self._all_treatment_levels)) if not is_d_subset: raise ValueError('Invalid reference_levels. reference_levels has to be an iterable subset or ' 'a single element of the unique treatment levels in the data.') @@ -753,6 +775,36 @@ def _check_data(self, obj_dml_data): raise ValueError('The data must not contain instrumental variables.') return + def _check_external_predictions(self, external_predictions): + expected_keys = self.treatment_levels + if len(self._add_treatment_levels) > 0: + expected_keys += ['add_treatment_levels'] + if not isinstance(external_predictions, dict): + raise TypeError('external_predictions must be a dictionary. ' + + f'Object of type {type(external_predictions)} passed.') + + if not set(external_predictions.keys()) == set(expected_keys): + raise ValueError('external_predictions must contain predictions for all treatment levels. ' + + f'Expected keys: {set(expected_keys)}. 
' +
+                             f'Passed keys: {set(external_predictions.keys())}.')
+
+        contains_ml_g = ['ml_g' in external_predictions[treatment_level] for treatment_level in self.treatment_levels]
+        if not all(contains_ml_g) and not all([not contains for contains in contains_ml_g]):
+            raise ValueError('The predictions for ml_g have to be provided for all treatment levels or not at all.')
+        return
+
+    def _recompute_external_predictions(self, external_predictions):
+        ext_pred_dict = {}
+        for i_level in range(self.n_treatment_levels):
+            ext_pred_dict[self.treatment_levels[i_level]] = {
+                'ml_g1': external_predictions[self.treatment_levels[i_level]]['ml_g'],
+                'ml_m': external_predictions[self.treatment_levels[i_level]]['ml_m']
+            }
+            ext_pred_dict[self.treatment_levels[i_level]]['ml_g0'] = \
+                external_predictions[self.treatment_levels[i_level]]['ml_g']
+
+        return ext_pred_dict
+
     def _initialize_weights(self, weights):
         if weights is None:
             weights = np.ones(self._dml_data.n_obs)
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 7b3c8bdb0..752f77240 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -109,6 +109,50 @@ def test_apos_exception_properties_and_methods():
         dml_obj.sensitivity_summary
 
 
+@pytest.mark.ci
+def test_apos_exception_ext_pred():
+    dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0)
+    external_predictions = [0, 1]
+    msg = r"external_predictions must be a dictionary. Object of type <class 'list'> passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_obj.fit(external_predictions=external_predictions)
+
+    # test with a level subset
+    external_predictions = {
+        0: "dummy",
+        1: "dummy"
+    }
+    msg = (
+        r"external_predictions must contain predictions for all treatment levels\. "
+        r"Expected keys: \{0, 'add_treatment_levels'\}\. "
+        r"Passed keys: \{0, 1\}\."
+    )
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit(external_predictions=external_predictions)
+
+    external_predictions = {
+        0: {"ml_g": "dummy"},
+        'add_treatment_levels': {"ml_m": "dummy"}
+    }
+    msg = "The predictions for ml_g have to be provided for all treatment levels or not at all."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit(external_predictions=external_predictions)
+
+    # test with all levels
+    dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0, 1, 2, 3])
+    external_predictions = {
+        0: "dummy",
+        1: "dummy"
+    }
+    msg = (
+        r"external_predictions must contain predictions for all treatment levels\. "
+        r"Expected keys: \{0, 1, 2, 3\}\. "
+        r"Passed keys: \{0, 1\}\."
+    )
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit(external_predictions=external_predictions)
+
+
 @pytest.mark.ci
 def test_causal_contrast_exceptions():
     msg = r"Apply fit\(\) before causal_contrast\(\)."
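
A minimal usage sketch of the workflow exercised by the tests above, placed here for orientation; it assumes the data helper and the DoubleMLAPOS API as they stand at this point in the series, and the learners and sample size are illustrative placeholders, not part of any patch:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from doubleml import DoubleMLAPOS, DoubleMLData
    from doubleml.datasets import make_irm_data_discrete_treatments

    # Discrete-treatment data with levels {0, 1, 2, 3}, mirroring the exception tests.
    data = make_irm_data_discrete_treatments(n_obs=500)
    df = pd.DataFrame(
        np.column_stack((data['y'], data['d'], data['x'])),
        columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])]
    )
    dml_data = DoubleMLData(df, 'y', 'd')

    dml_obj = DoubleMLAPOS(dml_data, LinearRegression(), LogisticRegression(),
                           treatment_levels=[0, 1, 2, 3])
    dml_obj.fit()  # fit() must come first; otherwise causal_contrast() raises
    causal_contrast = dml_obj.causal_contrast(reference_levels=0)
    print(causal_contrast.thetas)  # contrasts of levels 1, 2, 3 against level 0
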
From e18aba113a63de76f2de5ae6b26d98de7d6fa7a9 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:26:54 +0200 Subject: [PATCH 90/98] add exception for multiple treatment variables in apo setting --- doubleml/irm/apo.py | 4 ++++ doubleml/irm/tests/test_apo_exceptions.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index f52750709..b0f4f3817 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -367,6 +367,10 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ return res def _check_data(self, obj_dml_data): + if len(obj_dml_data.d_cols) > 1: + raise ValueError('Only one treatment variable is allowed. ' + + f'Got {len(obj_dml_data.d_cols)} treatment variables.') + if obj_dml_data.z_cols is not None: raise ValueError('Incompatible data. ' + ' and '.join(obj_dml_data.z_cols) + diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index 970ba1900..31fa6b447 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -25,6 +25,11 @@ def test_apo_exception_data(): with pytest.raises(TypeError, match=msg): _ = DoubleMLAPO(pd.DataFrame(), ml_g, ml_m, treatment_level=0) + msg = 'Only one treatment variable is allowed. Got 2 treatment variables.' + with pytest.raises(ValueError, match=msg): + dml_data_multiple = DoubleMLData(df_apo, 'y', ['d', 'x1']) + _ = DoubleMLAPO(dml_data_multiple, ml_g, ml_m, treatment_level=0) + dml_data_z = make_iivm_data() msg = r'Incompatible data. z have been set as instrumental variable\(s\).' with pytest.raises(ValueError, match=msg): From bfa57dafc84825b728c16cd54237be338712ea11 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:32:00 +0200 Subject: [PATCH 91/98] add simple external predictions to apos --- doubleml/irm/apos.py | 22 ++-- .../tests/test_apos_external_predictions.py | 103 ++++++++++++++++++ 2 files changed, 116 insertions(+), 9 deletions(-) create mode 100644 doubleml/irm/tests/test_apos_external_predictions.py diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 27f4d5c12..1b5f7e0b7 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -385,7 +385,9 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ if external_predictions is not None: self._check_external_predictions(external_predictions) - ext_pred_dict = self._recompute_external_predictions(self) + ext_pred_dict = self._recompute_external_predictions(external_predictions) + else: + ext_pred_dict = None # parallel estimation of the models parallel = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch='2*n_jobs') @@ -794,14 +796,16 @@ def _check_external_predictions(self, external_predictions): return def _recompute_external_predictions(self, external_predictions): - ext_pred_dict = {} - for i_level in range(self.n_treatment_levels): - ext_pred_dict[self.treatment_levels[i_level]] = { - 'ml_g1': external_predictions[self.treatment_levels[i_level]]['ml_g'], - 'ml_m': external_predictions[self.treatment_levels[i_level]]['ml_m'] - } - ext_pred_dict[self.treatment_levels[i_level]]['ml_g0'] = \ - external_predictions[self.treatment_levels[i_level]]['ml_g'] + d_col = self._dml_data.d_cols[0] + ext_pred_dict = {treatment_level: {d_col: {}} for treatment_level in self.treatment_levels} + for treatment_level in self.treatment_levels: + if 
"ml_g1" in external_predictions[treatment_level]: + ext_pred_dict[treatment_level][d_col]['ml_g1'] = external_predictions[treatment_level]['ml_g1'] + if "ml_m" in external_predictions[treatment_level]: + ext_pred_dict[treatment_level][d_col]['ml_m'] = external_predictions[treatment_level]['ml_m'] + if "ml_g0" in external_predictions[treatment_level]: + ext_pred_dict[treatment_level][d_col]['ml_g0'] = external_predictions[treatment_level]['ml_g0'] + # TODO: Combine the models return ext_pred_dict diff --git a/doubleml/irm/tests/test_apos_external_predictions.py b/doubleml/irm/tests/test_apos_external_predictions.py new file mode 100644 index 000000000..1aa8d4ba1 --- /dev/null +++ b/doubleml/irm/tests/test_apos_external_predictions.py @@ -0,0 +1,103 @@ +import pytest +import numpy as np +import pandas as pd +import math + +from sklearn.linear_model import LinearRegression, LogisticRegression +from doubleml import DoubleMLAPOS, DoubleMLData +from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.utils import DMLDummyRegressor, DMLDummyClassifier + +from ...tests._utils import draw_smpls + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[[0, 1, 2, 3], [0, 1]]) +def treatment_levels(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_apos_ext_fixture(n_rep, treatment_levels, set_ml_m_ext, set_ml_g_ext): + score = "APO" + ext_predictions = { + treatment_level: {} for treatment_level in treatment_levels + } + + np.random.seed(3141) + n_obs = 500 + data_apo = make_irm_data_discrete_treatments(n_obs=n_obs) + df_apo = pd.DataFrame( + np.column_stack((data_apo['y'], data_apo['d'], data_apo['x'])), + columns=['y', 'd'] + ['x' + str(i) for i in range(data_apo['x'].shape[1])] + ) + + dml_data = DoubleMLData(df_apo, 'y', 'd') + d = data_apo['d'] + all_smpls = draw_smpls(n_obs, n_folds=5, n_rep=n_rep, groups=d) + + kwargs = { + "obj_dml_data": dml_data, + "score": score, + "treatment_levels": treatment_levels, + "n_rep": n_rep, + "draw_sample_splitting": False + } + + dml_obj = DoubleMLAPOS(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + dml_obj.set_sample_splitting(all_smpls=all_smpls) + + np.random.seed(3141) + dml_obj.fit(store_predictions=True) + + if set_ml_m_ext: + for i_treatment_level, treatment_level in enumerate(treatment_levels): + ext_predictions[treatment_level]["ml_m"] = dml_obj.modellist[i_treatment_level].predictions["ml_m"][:, :, 0] + ml_m = DMLDummyClassifier() + else: + ml_m = LogisticRegression(random_state=42) + + if set_ml_g_ext: + for i_treatment_level, treatment_level in enumerate(treatment_levels): + ext_predictions[treatment_level]["ml_g0"] = dml_obj.modellist[i_treatment_level].predictions["ml_g0"][:, :, 0] + ext_predictions[treatment_level]["ml_g1"] = dml_obj.modellist[i_treatment_level].predictions["ml_g1"][:, :, 0] + ml_g = DMLDummyRegressor() + else: + ml_g = LinearRegression() + + dml_obj_ext = DoubleMLAPOS(ml_g=ml_g, ml_m=ml_m, **kwargs) + dml_obj_ext.set_sample_splitting(all_smpls=all_smpls) + + np.random.seed(3141) + dml_obj_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef_normal": dml_obj.coef[0], + "coef_ext": dml_obj_ext.coef[0] + } + + return res_dict + + 
+@pytest.mark.ci +def test_doubleml_apos_ext_coef(doubleml_apos_ext_fixture): + assert math.isclose( + doubleml_apos_ext_fixture["coef_normal"], + doubleml_apos_ext_fixture["coef_ext"], + rel_tol=1e-9, + abs_tol=1e-4 + ) From 6f8d3f4c7fd7232f01d9c408277d2ebe1207687a Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Wed, 31 Jul 2024 14:36:49 +0200 Subject: [PATCH 92/98] fix sensitivity_summary tests --- doubleml/irm/tests/test_apos_exceptions.py | 2 +- doubleml/tests/test_exceptions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index 752f77240..716364cf4 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -106,7 +106,7 @@ def test_apos_exception_properties_and_methods(): dml_obj.sensitivity_plot() msg = r'Apply sensitivity_analysis\(\) before sensitivity_summary.' with pytest.raises(ValueError, match=msg): - dml_obj.sensitivity_summary + _ = dml_obj.sensitivity_summary @pytest.mark.ci diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 97b8dac41..cacd1edfa 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -1111,7 +1111,7 @@ def test_doubleml_sensitivity_summary(): dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), trimming_threshold=0.1) msg = r'Apply sensitivity_analysis\(\) before sensitivity_summary.' with pytest.raises(ValueError, match=msg): - _ = dml_irm.sensitivity_summary() + _ = dml_irm.sensitivity_summary @pytest.mark.ci From e25663c3dbf032176556338379ff7f45a34234f9 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 1 Aug 2024 08:33:02 +0200 Subject: [PATCH 93/98] add more restrictions on ext predictions for apos --- doubleml/irm/apos.py | 20 ++++++++++--------- doubleml/irm/tests/test_apos_exceptions.py | 23 ++++++++++++++-------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 1b5f7e0b7..7a91826b7 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -374,8 +374,6 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_ treatment level. The values have to be dictionaries which containkeys ``'ml_g'`` and ``'ml_m'``. The predictions for ``'ml_m'`` are passed directly to the DoubleMLAPO model, whereas the predictions for ``'ml_g'`` are used to compute predictions for ``'ml_g1'`` and ``'ml_g0'``. - If the treatment levels do not cover all levels in the data, combined predictions for ``'ml_g'`` have - to be provided under the key ``'add_treatment_levels'``. Default is `None`. Returns @@ -779,20 +777,24 @@ def _check_data(self, obj_dml_data): def _check_external_predictions(self, external_predictions): expected_keys = self.treatment_levels - if len(self._add_treatment_levels) > 0: - expected_keys += ['add_treatment_levels'] if not isinstance(external_predictions, dict): raise TypeError('external_predictions must be a dictionary. ' + f'Object of type {type(external_predictions)} passed.') - if not set(external_predictions.keys()) == set(expected_keys): - raise ValueError('external_predictions must contain predictions for all treatment levels. ' + + if not set(external_predictions.keys()).issubset(set(expected_keys)): + raise ValueError('external_predictions must be a subset of all treatment levels. 
' +
                              f'Expected keys: {set(expected_keys)}. ' +
                              f'Passed keys: {set(external_predictions.keys())}.')
 
-        contains_ml_g = ['ml_g' in external_predictions[treatment_level] for treatment_level in self.treatment_levels]
-        if not all(contains_ml_g) and not all([not contains for contains in contains_ml_g]):
-            raise ValueError('The predictions for ml_g have to be provided for all treatment levels or not at all.')
+        expected_learner_keys = ['ml_g0', 'ml_g1', 'ml_m']
+        for key, value in external_predictions.items():
+            if not isinstance(value, dict):
+                raise TypeError(f'external_predictions[{key}] must be a dictionary. ' +
+                                f'Object of type {type(value)} passed.')
+            if not set(value.keys()).issubset(set(expected_learner_keys)):
+                raise ValueError(f'external_predictions[{key}] must be a subset of {set(expected_learner_keys)}. ' +
+                                 f'Passed keys: {set(value.keys())}.')
+
         return
 
     def _rename_external_predictions(self, external_predictions):
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 752f77240..0c20efe53 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -123,18 +123,24 @@ def test_apos_exception_ext_pred():
         1: "dummy"
     }
     msg = (
-        r"external_predictions must contain predictions for all treatment levels\. "
-        r"Expected keys: \{0, 'add_treatment_levels'\}\. "
+        r"external_predictions must be a subset of all treatment levels\. "
+        r"Expected keys: \{0\}\. "
         r"Passed keys: \{0, 1\}\."
     )
     with pytest.raises(ValueError, match=msg):
         dml_obj.fit(external_predictions=external_predictions)
 
     external_predictions = {
-        0: {"ml_g": "dummy"},
-        'add_treatment_levels': {"ml_m": "dummy"}
+        0: "dummy",
+    }
+    msg = r"external_predictions\[0\] must be a dictionary. Object of type <class 'str'> passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_obj.fit(external_predictions=external_predictions)
+
+    external_predictions = {
+        0: {"ml_g": "dummy"}
     }
-    msg = "The predictions for ml_g have to be provided for all treatment levels or not at all."
+    msg = r"external_predictions\[0\] must be a subset of \{.*\}. Passed keys: \{'ml_g'\}\."
     with pytest.raises(ValueError, match=msg):
         dml_obj.fit(external_predictions=external_predictions)
 
@@ -148,12 +148,13 @@ def test_apos_exception_ext_pred():
     # test with all levels
     dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=[0, 1, 2, 3])
     external_predictions = {
         0: "dummy",
-        1: "dummy"
+        1: "dummy",
+        4: "dummy"
     }
     msg = (
-        r"external_predictions must contain predictions for all treatment levels\. "
+        r"external_predictions must be a subset of all treatment levels\. "
         r"Expected keys: \{0, 1, 2, 3\}\. "
-        r"Passed keys: \{0, 1\}\."
+        r"Passed keys: \{0, 1, 4\}\."
     )
     with pytest.raises(ValueError, match=msg):
         dml_obj.fit(external_predictions=external_predictions)

From 07e919ab99f99646e6a66801363bda2d0b677aff Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 1 Aug 2024 08:59:14 +0200
Subject: [PATCH 94/98] finalize external predictions (docstrings and method names)

---
 doubleml/irm/apos.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py
index 7a91826b7..2a6b5ce1a 100644
--- a/doubleml/irm/apos.py
+++ b/doubleml/irm/apos.py
@@ -370,10 +370,9 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_
             Default is ``False``.
 
        external_predictions : dict or None
-            A nested dictionary where the keys correspond the the treatment levels and contain predictions according to each
-            treatment level. The values have to be dictionaries which containkeys ``'ml_g'`` and ``'ml_m'``.
-            The predictions for ``'ml_m'`` are passed directly to the DoubleMLAPO model,
-            whereas the predictions for ``'ml_g'`` are used to compute predictions for ``'ml_g1'`` and ``'ml_g0'``.
+            A nested dictionary where the keys correspond to the treatment levels and can contain predictions according to
+            each treatment level. The values have to be dictionaries which can contain keys ``'ml_g0'``, ``'ml_g1'``
+            and ``'ml_m'``.
             Default is `None`.
 
         Returns
@@ -382,7 +382,7 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_
 
         if external_predictions is not None:
             self._check_external_predictions(external_predictions)
-            ext_pred_dict = self._recompute_external_predictions(external_predictions)
+            ext_pred_dict = self._rename_external_predictions(external_predictions)
         else:
             ext_pred_dict = None
@@ -797,7 +796,7 @@ def _check_external_predictions(self, external_predictions):
         return
 
-    def _recompute_external_predictions(self, external_predictions):
+    def _rename_external_predictions(self, external_predictions):
         d_col = self._dml_data.d_cols[0]
         ext_pred_dict = {treatment_level: {d_col: {}} for treatment_level in self.treatment_levels}
         for treatment_level in self.treatment_levels:

From 7b8b330b2aa9761587a0bbaf845272e3f6773b49 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 1 Aug 2024 09:06:28 +0200
Subject: [PATCH 95/98] add evaluations for external predictions in DoubleMLAPO

---
 doubleml/irm/apo.py                              |  6 +++---
 .../irm/tests/test_apos_external_predictions.py  | 17 ++++++++++++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py
index b0f4f3817..93c3c0df4 100644
--- a/doubleml/irm/apo.py
+++ b/doubleml/irm/apo.py
@@ -218,7 +218,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
         if g0_external:
             # use external predictions
             g_hat0 = {'preds': external_predictions['ml_g0'],
-                      'targets': None,
+                      'targets': _cond_targets(y, cond_sample=(treated == 0)),
                       'models': None}
         else:
             g_hat0 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d0, n_jobs=n_jobs_cv,
@@ -233,7 +233,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
         if g1_external:
             # use external predictions
             g_hat1 = {'preds': external_predictions['ml_g1'],
-                      'targets': None,
+                      'targets': _cond_targets(y, cond_sample=(treated == 1)),
                       'models': None}
         else:
             g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv,
@@ -250,7 +250,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
         if m_external:
             # use external predictions
             m_hat = {'preds': external_predictions['ml_m'],
-                     'targets': None,
+                     'targets': treated,
                      'models': None}
         else:
             m_hat = _dml_cv_predict(self._learner['ml_m'], x, treated, smpls=smpls, n_jobs=n_jobs_cv,
diff --git 
a/doubleml/irm/tests/test_apos_external_predictions.py b/doubleml/irm/tests/test_apos_external_predictions.py index 1aa8d4ba1..b6a2c8eed 100644 --- a/doubleml/irm/tests/test_apos_external_predictions.py +++ b/doubleml/irm/tests/test_apos_external_predictions.py @@ -87,7 +87,10 @@ def doubleml_apos_ext_fixture(n_rep, treatment_levels, set_ml_m_ext, set_ml_g_ex res_dict = { "coef_normal": dml_obj.coef[0], - "coef_ext": dml_obj_ext.coef[0] + "coef_ext": dml_obj_ext.coef[0], + "dml_obj": dml_obj, + "dml_obj_ext": dml_obj_ext, + "treatment_levels": treatment_levels } return res_dict @@ -101,3 +104,15 @@ def test_doubleml_apos_ext_coef(doubleml_apos_ext_fixture): rel_tol=1e-9, abs_tol=1e-4 ) + + +@pytest.mark.ci +def test_doubleml_apos_ext_pred_nuisance(doubleml_apos_ext_fixture): + for i_level, _ in enumerate(doubleml_apos_ext_fixture["treatment_levels"]): + for nuisance_key in ["ml_g0", "ml_g1", "ml_m"]: + assert np.allclose( + doubleml_apos_ext_fixture["dml_obj"].modellist[i_level].nuisance_loss[nuisance_key], + doubleml_apos_ext_fixture["dml_obj_ext"].modellist[i_level].nuisance_loss[nuisance_key], + rtol=1e-9, + atol=1e-4 + ) From ae5b2c0d8037a4a08bd255ed274e5a30b775b73a Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 1 Aug 2024 12:08:03 +0200 Subject: [PATCH 96/98] fix dimensions in gain_statistics --- doubleml/utils/gain_statistics.py | 41 +++++++++++-------- .../tests/test_exceptions_gain_statistics.py | 11 +++-- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/doubleml/utils/gain_statistics.py b/doubleml/utils/gain_statistics.py index bfd388455..2fa233b33 100644 --- a/doubleml/utils/gain_statistics.py +++ b/doubleml/utils/gain_statistics.py @@ -19,44 +19,49 @@ def gain_statistics(dml_long, dml_short): benchmark_dict : dict Benchmarking dictionary (dict) with values for ``cf_d``, ``cf_y``, ``rho``, and ``delta_theta``. """ - if not isinstance(dml_long.sensitivity_elements, dict): + + # set input for readability + sensitivity_elements_long = dml_long.framework.sensitivity_elements + sensitivity_elements_short = dml_short.framework.sensitivity_elements + + if not isinstance(sensitivity_elements_long, dict): raise TypeError("dml_long does not contain the necessary sensitivity elements. " - "Expected dict for dml_long.sensitivity_elements.") + "Expected dict for dml_long.framework.sensitivity_elements.") expected_keys = ['sigma2', 'nu2'] - if not all(key in dml_long.sensitivity_elements.keys() for key in expected_keys): + if not all(key in sensitivity_elements_long.keys() for key in expected_keys): raise ValueError("dml_long does not contain the necessary sensitivity elements. " "Required keys are: " + str(expected_keys)) - if not isinstance(dml_short.sensitivity_elements, dict): + if not isinstance(sensitivity_elements_short, dict): raise TypeError("dml_short does not contain the necessary sensitivity elements. " - "Expected dict for dml_short.sensitivity_elements.") - if not all(key in dml_short.sensitivity_elements.keys() for key in expected_keys): + "Expected dict for dml_short.framework.sensitivity_elements.") + if not all(key in sensitivity_elements_short.keys() for key in expected_keys): raise ValueError("dml_short does not contain the necessary sensitivity elements. 
" "Required keys are: " + str(expected_keys)) for key in expected_keys: - if not isinstance(dml_long.sensitivity_elements[key], np.ndarray): + if not isinstance(sensitivity_elements_long[key], np.ndarray): raise TypeError("dml_long does not contain the necessary sensitivity elements. " f"Expected numpy.ndarray for key {key}.") - if not isinstance(dml_short.sensitivity_elements[key], np.ndarray): + if not isinstance(sensitivity_elements_short[key], np.ndarray): raise TypeError("dml_short does not contain the necessary sensitivity elements. " f"Expected numpy.ndarray for key {key}.") - if len(dml_long.sensitivity_elements[key].shape) != 3 or dml_long.sensitivity_elements[key].shape[0] != 1: + if len(sensitivity_elements_long[key].shape) != 3 or sensitivity_elements_long[key].shape[0] != 1: raise ValueError("dml_long does not contain the necessary sensitivity elements. " f"Expected 3 dimensions of shape (1, n_coef, n_rep) for key {key}.") - if len(dml_short.sensitivity_elements[key].shape) != 3 or dml_short.sensitivity_elements[key].shape[0] != 1: + if len(sensitivity_elements_short[key].shape) != 3 or sensitivity_elements_short[key].shape[0] != 1: raise ValueError("dml_short does not contain the necessary sensitivity elements. " f"Expected 3 dimensions of shape (1, n_coef, n_rep) for key {key}.") - if not np.array_equal(dml_long.sensitivity_elements[key].shape, dml_short.sensitivity_elements[key].shape): + if not np.array_equal(sensitivity_elements_long[key].shape, sensitivity_elements_short[key].shape): raise ValueError("dml_long and dml_short do not contain the same shape of sensitivity elements. " - "Shapes of " + key + " are: " + str(dml_long.sensitivity_elements[key].shape) + - " and " + str(dml_short.sensitivity_elements[key].shape)) + "Shapes of " + key + " are: " + str(sensitivity_elements_long[key].shape) + + " and " + str(sensitivity_elements_short[key].shape)) if not isinstance(dml_long.all_coef, np.ndarray): raise TypeError("dml_long.all_coef does not contain the necessary coefficients. Expected numpy.ndarray.") if not isinstance(dml_short.all_coef, np.ndarray): raise TypeError("dml_short.all_coef does not contain the necessary coefficients. Expected numpy.ndarray.") - expected_shape = (dml_long.sensitivity_elements['sigma2'].shape[1], dml_long.sensitivity_elements['sigma2'].shape[2]) + expected_shape = (sensitivity_elements_long['sigma2'].shape[1], sensitivity_elements_long['sigma2'].shape[2]) if dml_long.all_coef.shape != expected_shape: raise ValueError("dml_long.all_coef does not contain the necessary coefficients. 
Expected shape: " + str(expected_shape)) @@ -66,10 +71,10 @@ def gain_statistics(dml_long, dml_short): # save elements for readability var_y = np.var(dml_long._dml_data.y) - var_y_residuals_long = np.squeeze(dml_long.sensitivity_elements['sigma2'], axis=0) - nu2_long = np.squeeze(dml_long.sensitivity_elements['nu2'], axis=0) - var_y_residuals_short = np.squeeze(dml_short.sensitivity_elements['sigma2'], axis=0) - nu2_short = np.squeeze(dml_short.sensitivity_elements['nu2'], axis=0) + var_y_residuals_long = np.squeeze(sensitivity_elements_long['sigma2'], axis=0) + nu2_long = np.squeeze(sensitivity_elements_long['nu2'], axis=0) + var_y_residuals_short = np.squeeze(sensitivity_elements_short['sigma2'], axis=0) + nu2_short = np.squeeze(sensitivity_elements_short['nu2'], axis=0) # compute nonparametric R2 R2_y_long = 1.0 - np.divide(var_y_residuals_long, var_y) diff --git a/doubleml/utils/tests/test_exceptions_gain_statistics.py b/doubleml/utils/tests/test_exceptions_gain_statistics.py index c4f3d3af3..9f42063d6 100644 --- a/doubleml/utils/tests/test_exceptions_gain_statistics.py +++ b/doubleml/utils/tests/test_exceptions_gain_statistics.py @@ -4,9 +4,14 @@ from doubleml.utils.gain_statistics import gain_statistics +class test_framework(): + def __init__(self, sensitivity_elements): + self.sensitivity_elements = sensitivity_elements + + class test_dml_class(): def __init__(self, sensitivity_elements, all_coef): - self.sensitivity_elements = sensitivity_elements + self.framework = test_framework(sensitivity_elements) self.all_coef = all_coef @@ -30,11 +35,11 @@ def test_doubleml_exception_data(): sensitivity_elements=np.random.normal(size=(n_obs, n_rep, n_coef)), all_coef=np.random.normal(size=(n_rep, n_coef)) ) - msg = r"dml_long does not contain the necessary sensitivity elements\. Expected dict for dml_long\.sensitivity_elements\." + msg = r"dml_long does not contain the necessary sensitivity elements\. Expected dict for dml_long\.framework\.sensitivity_elements\." with pytest.raises(TypeError, match=msg): _ = gain_statistics(dml_incorrect, dml_correct) msg = r"dml_short does not contain the necessary sensitivity elements\. " - msg += r"Expected dict for dml_short\.sensitivity_elements\." + msg += r"Expected dict for dml_short\.framework\.sensitivity_elements\." with pytest.raises(TypeError, match=msg): _ = gain_statistics(dml_correct, dml_incorrect) From fef69878cc972b4821ea09bf020d2f274645b373 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:06:10 +0200 Subject: [PATCH 97/98] fix formatting --- doubleml/utils/tests/test_exceptions_gain_statistics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doubleml/utils/tests/test_exceptions_gain_statistics.py b/doubleml/utils/tests/test_exceptions_gain_statistics.py index 9f42063d6..734185eb4 100644 --- a/doubleml/utils/tests/test_exceptions_gain_statistics.py +++ b/doubleml/utils/tests/test_exceptions_gain_statistics.py @@ -35,7 +35,8 @@ def test_doubleml_exception_data(): sensitivity_elements=np.random.normal(size=(n_obs, n_rep, n_coef)), all_coef=np.random.normal(size=(n_rep, n_coef)) ) - msg = r"dml_long does not contain the necessary sensitivity elements\. Expected dict for dml_long\.framework\.sensitivity_elements\." + msg = r"dml_long does not contain the necessary sensitivity elements\. " + msg += r"Expected dict for dml_long\.framework\.sensitivity_elements\." 
with pytest.raises(TypeError, match=msg): _ = gain_statistics(dml_incorrect, dml_correct) msg = r"dml_short does not contain the necessary sensitivity elements\. " From 5200ce289de075049e16831d85697893f46c2681 Mon Sep 17 00:00:00 2001 From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:07:32 +0200 Subject: [PATCH 98/98] update make_irm_data_discrete_treatments descriptions --- doubleml/datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index e2f2dcddf..b17c43f5f 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1503,6 +1503,10 @@ def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, rando The number of treatment levels. Default is ``3``. + linear : bool + Indicates whether the true underlying regression is linear. + Default is ``False``. + random_state : int Random seed for reproducibility. Default is ``42``.
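
Taken together, a compact sketch of the sensitivity workflow this series adds to DoubleMLAPOS, using the same data setup as the sketch after patch 89; it assumes the final state of the series, and the learner choices are again illustrative only:

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from doubleml import DoubleMLAPOS, DoubleMLData
    from doubleml.datasets import make_irm_data_discrete_treatments

    data = make_irm_data_discrete_treatments(n_obs=500, n_levels=3, linear=False, random_state=42)
    df = pd.DataFrame(
        np.column_stack((data['y'], data['d'], data['x'])),
        columns=['y', 'd'] + ['x' + str(i) for i in range(data['x'].shape[1])]
    )
    dml_data = DoubleMLData(df, 'y', 'd')

    dml_obj = DoubleMLAPOS(dml_data, LinearRegression(), LogisticRegression(),
                           treatment_levels=[0, 1, 2, 3])
    dml_obj.fit()
    print(dml_obj.summary)

    # sensitivity_summary is a property and requires sensitivity_analysis() first.
    dml_obj.sensitivity_analysis()
    print(dml_obj.sensitivity_summary)

    # Benchmark a single observed confounder; this refits a short model without 'x1'
    # and relies on the corrected gain_statistics from patches 85 and 96.
    print(dml_obj.sensitivity_benchmark(benchmarking_set=['x1']))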