Merge pull request #271 from DoubleML/s-update-blp-cov-type

SvenKlaassen · web-flow · commit 0ee11dd7453d · 2024-10-10T16:00:35.000+02:00
Add cov_type and kwargs to BLP object
diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py
@@ -389,7 +389,7 @@ def _check_data(self, obj_dml_data):
 
         return
 
-    def capo(self, basis, is_gate=False):
+    def capo(self, basis, is_gate=False, **kwargs):
         """
         Calculate conditional average potential outcomes (CAPO) for a given basis.
 
@@ -398,10 +398,14 @@ def capo(self, basis, is_gate=False):
         basis : :class:`pandas.DataFrame`
             The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
             where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+
         is_gate : bool
             Indicates whether the basis is constructed for GATE/GAPOs (dummy-basis).
             Default is ``False``.
 
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
         Returns
         -------
         model : :class:`doubleML.DoubleMLBLP`
@@ -420,10 +424,10 @@ def capo(self, basis, is_gate=False):
         orth_signal = self.psi_elements['psi_b'].reshape(-1)
         # fit the best linear predictor
         model = DoubleMLBLP(orth_signal, basis=basis, is_gate=is_gate)
-        model.fit()
+        model.fit(**kwargs)
         return model
 
-    def gapo(self, groups):
+    def gapo(self, groups, **kwargs):
         """
         Calculate group average potential outcomes (GAPO) for groups.
 
@@ -434,6 +438,9 @@ def gapo(self, groups):
             Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations
             and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str).
 
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
         Returns
         -------
         model : :class:`doubleML.DoubleMLBLP`
@@ -453,5 +460,5 @@ def gapo(self, groups):
         if any(groups.sum(0) <= 5):
             warnings.warn('At least one group effect is estimated with less than 6 observations.')
 
-        model = self.capo(groups, is_gate=True)
+        model = self.capo(groups, is_gate=True, **kwargs)
         return model
diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py
@@ -431,7 +431,7 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_
 
         return res
 
-    def cate(self, basis, is_gate=False):
+    def cate(self, basis, is_gate=False, **kwargs):
         """
         Calculate conditional average treatment effects (CATE) for a given basis.
 
@@ -440,10 +440,14 @@ def cate(self, basis, is_gate=False):
         basis : :class:`pandas.DataFrame`
             The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
             where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+
         is_gate : bool
             Indicates whether the basis is constructed for GATEs (dummy-basis).
             Default is ``False``.
 
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
         Returns
         -------
         model : :class:`doubleML.DoubleMLBLP`
@@ -462,10 +466,10 @@ def cate(self, basis, is_gate=False):
         orth_signal = self.psi_elements['psi_b'].reshape(-1)
         # fit the best linear predictor
         model = DoubleMLBLP(orth_signal, basis=basis, is_gate=is_gate)
-        model.fit()
+        model.fit(**kwargs)
         return model
 
-    def gate(self, groups):
+    def gate(self, groups, **kwargs):
         """
         Calculate group average treatment effects (GATE) for groups.
 
@@ -476,6 +480,9 @@ def gate(self, groups):
             Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations
             and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str).
 
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
         Returns
         -------
         model : :class:`doubleML.DoubleMLBLP`
@@ -495,7 +502,7 @@ def gate(self, groups):
         if any(groups.sum(0) <= 5):
             warnings.warn('At least one group effect is estimated with less than 6 observations.')
 
-        model = self.cate(groups, is_gate=True)
+        model = self.cate(groups, is_gate=True, **kwargs)
         return model
 
     def policy_tree(self, features, depth=2, **tree_params):
diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py
@@ -200,8 +200,14 @@ def test_dml_apo_sensitivity(dml_apo_fixture):
                            rtol=1e-9, atol=1e-4)
 
 
+@pytest.fixture(scope='module',
+                params=["nonrobust", "HC0", "HC1", "HC2", "HC3"])
+def cov_type(request):
+    return request.param
+
+
 @pytest.mark.ci
-def test_dml_apo_capo_gapo(treatment_level):
+def test_dml_apo_capo_gapo(treatment_level, cov_type):
     n = 20
     # collect data
     np.random.seed(42)
@@ -221,25 +227,28 @@ def test_dml_apo_capo_gapo(treatment_level):
     dml_obj.fit()
     # create a random basis
     random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5)))
-    capo = dml_obj.capo(random_basis)
+    capo = dml_obj.capo(random_basis, cov_type=cov_type)
     assert isinstance(capo, dml.utils.blp.DoubleMLBLP)
     assert isinstance(capo.confint(), pd.DataFrame)
+    assert capo.blp_model.cov_type == cov_type
 
     groups_1 = pd.DataFrame(np.column_stack([obj_dml_data.data['X1'] <= -1.0,
                                              obj_dml_data.data['X1'] > 0.2]),
                             columns=['Group 1', 'Group 2'])
     msg = ('At least one group effect is estimated with less than 6 observations.')
     with pytest.warns(UserWarning, match=msg):
-        gapo_1 = dml_obj.gapo(groups_1)
+        gapo_1 = dml_obj.gapo(groups_1, cov_type=cov_type)
     assert isinstance(gapo_1, dml.utils.blp.DoubleMLBLP)
     assert isinstance(gapo_1.confint(), pd.DataFrame)
     assert all(gapo_1.confint().index == groups_1.columns.to_list())
+    assert gapo_1.blp_model.cov_type == cov_type
 
     np.random.seed(42)
     groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n, p=[0.1, 0.9]))
     msg = ('At least one group effect is estimated with less than 6 observations.')
     with pytest.warns(UserWarning, match=msg):
-        gapo_2 = dml_obj.gapo(groups_2)
+        gapo_2 = dml_obj.gapo(groups_2, cov_type=cov_type)
     assert isinstance(gapo_2, dml.utils.blp.DoubleMLBLP)
     assert isinstance(gapo_2.confint(), pd.DataFrame)
     assert all(gapo_2.confint().index == ["Group_1", "Group_2"])
+    assert gapo_2.blp_model.cov_type == cov_type
diff --git a/doubleml/irm/tests/test_irm.py b/doubleml/irm/tests/test_irm.py
@@ -187,8 +187,14 @@ def test_dml_irm_sensitivity_rho0(dml_irm_fixture):
                        rtol=1e-9, atol=1e-4)
 
 
+@pytest.fixture(scope='module',
+                params=["nonrobust", "HC0", "HC1", "HC2", "HC3"])
+def cov_type(request):
+    return request.param
+
+
 @pytest.mark.ci
-def test_dml_irm_cate_gate():
+def test_dml_irm_cate_gate(cov_type):
     n = 9
     # collect data
     np.random.seed(42)
@@ -207,28 +213,31 @@ def test_dml_irm_cate_gate():
     dml_irm_obj.fit()
     # create a random basis
     random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5)))
-    cate = dml_irm_obj.cate(random_basis)
+    cate = dml_irm_obj.cate(random_basis, cov_type=cov_type)
     assert isinstance(cate, dml.utils.blp.DoubleMLBLP)
     assert isinstance(cate.confint(), pd.DataFrame)
+    assert cate.blp_model.cov_type == cov_type
 
     groups_1 = pd.DataFrame(np.column_stack([obj_dml_data.data['X1'] <= 0,
                                              obj_dml_data.data['X1'] > 0.2]),
                             columns=['Group 1', 'Group 2'])
     msg = ('At least one group effect is estimated with less than 6 observations.')
     with pytest.warns(UserWarning, match=msg):
-        gate_1 = dml_irm_obj.gate(groups_1)
+        gate_1 = dml_irm_obj.gate(groups_1, cov_type=cov_type)
     assert isinstance(gate_1, dml.utils.blp.DoubleMLBLP)
     assert isinstance(gate_1.confint(), pd.DataFrame)
     assert all(gate_1.confint().index == groups_1.columns.to_list())
+    assert gate_1.blp_model.cov_type == cov_type
 
     np.random.seed(42)
     groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n))
     msg = ('At least one group effect is estimated with less than 6 observations.')
     with pytest.warns(UserWarning, match=msg):
-        gate_2 = dml_irm_obj.gate(groups_2)
+        gate_2 = dml_irm_obj.gate(groups_2, cov_type=cov_type)
     assert isinstance(gate_2, dml.utils.blp.DoubleMLBLP)
     assert isinstance(gate_2.confint(), pd.DataFrame)
     assert all(gate_2.confint().index == ["Group_1", "Group_2"])
+    assert gate_2.blp_model.cov_type == cov_type
 
 
 @pytest.fixture(scope='module',
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
@@ -341,7 +341,7 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_
 
         return res
 
-    def cate(self, basis, is_gate=False):
+    def cate(self, basis, is_gate=False, **kwargs):
         """
         Calculate conditional average treatment effects (CATE) for a given basis.
 
@@ -350,10 +350,14 @@ def cate(self, basis, is_gate=False):
         basis : :class:`pandas.DataFrame`
             The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
             where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+
         is_gate : bool
             Indicates whether the basis is constructed for GATEs (dummy-basis).
             Default is ``False``.
 
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
         Returns
         -------
         model : :class:`doubleML.DoubleMLBLP`
@@ -374,10 +378,10 @@ def cate(self, basis, is_gate=False):
             basis=D_basis,
             is_gate=is_gate,
         )
-        model.fit()
+        model.fit(**kwargs)
         return model
 
-    def gate(self, groups):
+    def gate(self, groups, **kwargs):
         """
         Calculate group average treatment effects (GATE) for groups.
 
@@ -388,6 +392,9 @@ def gate(self, groups):
             Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations
             and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str).
 
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
         Returns
         -------
         model : :class:`doubleML.DoubleMLBLP`
@@ -407,7 +414,7 @@ def gate(self, groups):
         if any(groups.sum(0) <= 5):
             warnings.warn('At least one group effect is estimated with less than 6 observations.')
 
-        model = self.cate(groups, is_gate=True)
+        model = self.cate(groups, is_gate=True, **kwargs)
         return model
 
     def _partial_out(self):
diff --git a/doubleml/plm/tests/test_plr.py b/doubleml/plm/tests/test_plr.py
@@ -301,8 +301,14 @@ def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture):
                            rtol=1e-9, atol=1e-4)
 
 
+@pytest.fixture(scope='module',
+                params=["nonrobust", "HC0", "HC1", "HC2", "HC3"])
+def cov_type(request):
+    return request.param
+
+
 @pytest.mark.ci
-def test_dml_plr_cate_gate(score):
+def test_dml_plr_cate_gate(score, cov_type):
     n = 9
 
     # collect data
@@ -318,26 +324,29 @@ def test_dml_plr_cate_gate(score):
                                   score=score)
     dml_plr_obj.fit()
     random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5)))
-    cate = dml_plr_obj.cate(random_basis)
+    cate = dml_plr_obj.cate(random_basis, cov_type=cov_type)
     assert isinstance(cate, dml.DoubleMLBLP)
     assert isinstance(cate.confint(), pd.DataFrame)
+    assert cate.blp_model.cov_type == cov_type
 
     groups_1 = pd.DataFrame(
         np.column_stack([obj_dml_data.data['X1'] <= 0,
                          obj_dml_data.data['X1'] > 0.2]),
         columns=['Group 1', 'Group 2'])
     msg = ('At least one group effect is estimated with less than 6 observations.')
     with pytest.warns(UserWarning, match=msg):
-        gate_1 = dml_plr_obj.gate(groups_1)
+        gate_1 = dml_plr_obj.gate(groups_1, cov_type=cov_type)
     assert isinstance(gate_1, dml.utils.blp.DoubleMLBLP)
     assert isinstance(gate_1.confint(), pd.DataFrame)
     assert all(gate_1.confint().index == groups_1.columns.tolist())
+    assert gate_1.blp_model.cov_type == cov_type
 
     np.random.seed(42)
     groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n))
     msg = ('At least one group effect is estimated with less than 6 observations.')
     with pytest.warns(UserWarning, match=msg):
-        gate_2 = dml_plr_obj.gate(groups_2)
+        gate_2 = dml_plr_obj.gate(groups_2, cov_type=cov_type)
     assert isinstance(gate_2, dml.utils.blp.DoubleMLBLP)
     assert isinstance(gate_2.confint(), pd.DataFrame)
     assert all(gate_2.confint().index == ["Group_1", "Group_2"])
+    assert gate_2.blp_model.cov_type == cov_type
diff --git a/doubleml/utils/blp.py b/doubleml/utils/blp.py
@@ -110,18 +110,27 @@ def summary(self):
                                       columns=col_names)
         return df_summary
 
-    def fit(self):
+    def fit(self, cov_type='HC0', **kwargs):
         """
         Estimate DoubleMLBLP models.
 
+        Parameters
+        ----------
+        cov_type : str
+            The covariance type to be used in the estimation. Default is ``'HC0'``.
+            See :meth:`statsmodels.regression.linear_model.OLS.fit` for more information.
+
+        **kwargs : dict
+            Additional keyword arguments to be passed to :meth:`statsmodels.regression.linear_model.OLS.fit`.
+
         Returns
         -------
         self : object
         """
 
         # fit the best-linear-predictor of the orthogonal signal with respect to the grid
-        self._blp_model = sm.OLS(self._orth_signal, self._basis).fit()
-        self._blp_omega = self._blp_model.cov_HC0
+        self._blp_model = sm.OLS(self._orth_signal, self._basis).fit(cov_type=cov_type, **kwargs)
+        self._blp_omega = self._blp_model.cov_params().to_numpy()
 
         return self
 
diff --git a/doubleml/utils/tests/_utils_blp_manual.py b/doubleml/utils/tests/_utils_blp_manual.py
@@ -5,8 +5,8 @@
 import pandas as pd
 
 
-def fit_blp(orth_signal, basis):
-    blp_model = sm.OLS(orth_signal, basis).fit()
+def fit_blp(orth_signal, basis, cov_type, **kwargs):
+    blp_model = sm.OLS(orth_signal, basis).fit(cov_type=cov_type, **kwargs)
 
     return blp_model
 
@@ -15,7 +15,7 @@ def blp_confint(blp_model, basis, joint=False, level=0.95, n_rep_boot=500):
     alpha = 1 - level
     g_hat = blp_model.predict(basis)
 
-    blp_omega = blp_model.cov_HC0
+    blp_omega = blp_model.cov_params().to_numpy()
 
     blp_se = np.sqrt((basis.dot(blp_omega) * basis).sum(axis=1))
 
diff --git a/doubleml/utils/tests/test_blp.py b/doubleml/utils/tests/test_blp.py