190 changes: 121 additions & 69 deletions doubleml/datasets.py
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
import warnings

from scipy.linalg import toeplitz
from scipy.optimize import minimize_scalar
@@ -895,11 +896,11 @@ def f_ps(w, xi):
raise ValueError('Invalid return_type.')


def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs):
"""
Generates confounded data from an interactive regression model.

The data generating process is defined as follows (similar to the Monte Carlo simulation used
The data generating process is defined as follows (inspired by the Monte Carlo simulation used
in Sant'Anna and Zhao (2020)).

Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
@@ -924,22 +925,30 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):

.. math::

m(X, A) = P(D=1|X,A) = 0.5 + \\gamma_A \\cdot A
m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A

where

.. math::

p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))},

f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4).

and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`.
Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as

.. math::

P(D=1|X) = 0.5.
P(D=1|X) = p(Z).

Further, generate the outcome of interest :math:`Y` as

.. math::

Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon

g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4)
g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4)

where :math:`\\varepsilon \\sim \\mathcal{N}(0, \\sigma^2_{\\varepsilon})` with error variance defaulting to ``var_eps_y=1.0``.
This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
@@ -952,13 +961,13 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
\\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))})
\\cdot D (Z_5 + 1) + g(Z).

Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample).
Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be
set via the parameters ``gamma_a`` and ``beta_a``.

The observed data is given as :math:`W = (Y, D, X)`.
The observed data is given as :math:`W = (Y, D, Z)`.
Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`,
the potential outcomes of :math:`Y`, the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the
long and short forms of the main regression and the propensity score
the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and
in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE)
are returned in a dictionary.

Parameters
Expand All @@ -968,13 +977,16 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
Default is ``500``.
theta : float or int
Average treatment effect.
Default is ``5.0``.
cf_y : float
Percentage of the residual variation of the outcome explained by latent/confounding variable.
Default is ``0.04``.
cf_d : float
Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable.
Default is ``0.04``.
Default is ``0.0``.
gamma_a : float
Coefficient of the unobserved confounder in the propensity score.
Default is ``0.127``.
beta_a : float
Coefficient of the unobserved confounder in the outcome regression.
Default is ``0.58``.
linear : bool
If ``True``, :math:`Z` will be set to :math:`X`, such that the underlying (short) models are linear/logistic.
Default is ``False``.

Returns
-------
@@ -988,82 +1000,122 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
"""
c = 0.0 # the confounding strength is only valid for c=0
dim_x = 5
xi = 0.75
dim_x = kwargs.get('dim_x', 5)
trimming_threshold = kwargs.get('trimming_threshold', 0.01)
var_eps_y = kwargs.get('var_eps_y', 1.0)

# Specification of main regression function
def f_reg(w):
res = 2.5 + 0.74*w[:, 0] + 0.25 * w[:, 1] + 0.137*(w[:, 2] + w[:, 3])
return res

# Specification of prop score function
def f_ps(w, xi):
res = xi*(-w[:, 0] + 0.1*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3])
return res
# observed covariates
cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])

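# nonlinear transformations of x into the observed covariates z (cf. Sant'Anna and Zhao, 2020)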
z_tilde_1 = np.exp(0.5*x[:, 0])
z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3
z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2

z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:]))
z_tilde_5 = x[:, 4]
z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5))
z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)

# error terms and unobserved confounder
var_eps_y = 5
eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs)

# unobserved confounder
a_bounds = (-1, 1)
a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs)
var_a = np.square(a_bounds[1] - a_bounds[0]) / 12

# get the required impact of the confounder on the propensity score
possible_coefs = np.arange(0.001, 0.4999, 0.001)
gamma_a = possible_coefs[(np.arctanh(2*possible_coefs) / (2*possible_coefs)) - 1 - cf_d/(1 - cf_d) >= 0][0]

# compute short and long form of riesz representer
m_long = 0.5 + gamma_a*a
m_short = 0.5 * np.ones_like(m_long)
# Choose the features used in the models
if linear:
features_ps = x
features_reg = x
else:
features_ps = z
features_reg = z

p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
# compute short and long form of propensity score
m_long = p + gamma_a*a
m_short = p
# check propensity score bounds
if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold):
m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold)
m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold)
warnings.warn(f'Propensity score is close to 0 or 1. '
f'Trimming at {trimming_threshold} and {1.0-trimming_threshold} is applied.')
# generate treatment based on long form
u = np.random.uniform(low=0, high=1, size=n_obs)
d = 1.0 * (m_long >= u)

# short and long version of g
g_partial_reg = 210 + 27.4*z[:, 0] + 13.7*(z[:, 1] + z[:, 2] + z[:, 3])

dx = d * (x[:, 4] + 1)
d1x = x[:, 4] + 1
var_dx = np.var(dx)
cov_adx = np.cov(a, dx)[0, 1]

def f_g(beta_a):
g_diff = beta_a * (a - cov_adx / var_dx)
y_diff = eps_y + g_diff
return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y)
beta_a = minimize_scalar(f_g).x

# add treatment heterogeneity
d1x = z[:, 4] + 1
var_dx = np.var(d*(d1x))
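# Cov(A, D*(Z_5+1)) = gamma_a * Var(A), since A is independent of Z with E[A] = 0 and E[Z_5] = 0 (ignoring trimming)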
cov_adx = gamma_a * var_a
# Outcome regression
g_partial_reg = f_reg(features_reg)
# short model
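# the short form absorbs beta_a * A through its linear projection onto D*(Z_5+1)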
g_short_d0 = g_partial_reg
g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg
g_short = d * g_short_d1 + (1.0-d) * g_short_d0

# long model
g_long_d0 = g_partial_reg + beta_a * a
g_long_d1 = theta * d1x + g_partial_reg + beta_a * a
g_long = d * g_long_d1 + (1.0-d) * g_long_d0

y0 = g_long_d0 + eps_y
y1 = g_long_d1 + eps_y

y = d * y1 + (1.0-d) * y0

oracle_values = {'g_long': g_long,
'g_short': g_short,
'm_long': m_long,
'm_short': m_short,
'gamma_a': gamma_a,
'beta_a': beta_a,
'a': a,
'y0': y0,
'y1': y1,
'z': z}

res_dict = {'x': x,
'y': y,
'd': d,
'oracle_values': oracle_values}

# Potential outcomes
y_0 = g_long_d0 + eps_y
y_1 = g_long_d1 + eps_y
# Realized outcome
y = d * y_1 + (1.0-d) * y_0
# In-sample values for confounding strength
explained_residual_variance = np.square(g_long - g_short)
residual_variance = np.square(y - g_short)
cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance)
# compute the Riesz representation
treated_weight = d / np.mean(d)
untreated_weight = (1.0 - d) / np.mean(d)
# Odds ratios
propensity_ratio_long = m_long / (1.0 - m_long)
rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long)
rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long)
propensity_ratio_short = m_short / (1.0 - m_short)
rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short)
rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short)
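# in-sample confounding strength of the propensity score: relative gains in the variation of the Riesz representer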
cf_d_ate = (np.mean(1/(m_long * (1 - m_long))) - np.mean(1/(m_short * (1 - m_short)))) / np.mean(1/(m_long * (1 - m_long)))
cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long)
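# correlation between the long/short differences of regression and Riesz representer (zero without confounding)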
if (beta_a == 0) | (gamma_a == 0):
rho_ate = 0.0
rho_atte = 0.0
else:
rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1]
rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1]
oracle_values = {
'g_long': g_long,
'g_short': g_short,
'm_long': m_long,
'm_short': m_short,
'gamma_a': gamma_a,
'beta_a': beta_a,
'a': a,
'y_0': y_0,
'y_1': y_1,
'z': z,
'cf_y': cf_y,
'cf_d_ate': cf_d_ate,
'cf_d_atte': cf_d_atte,
'rho_ate': rho_ate,
'rho_atte': rho_atte,
}
res_dict = {
'x': x,
'y': y,
'd': d,
'oracle_values': oracle_values
}
return res_dict

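A minimal usage sketch of the revised generator, assuming this branch of DoubleML is installed: the in-sample confounding strengths are read off the returned oracle values rather than targeted via the removed ``cf_y``/``cf_d`` arguments.

import numpy as np
from doubleml.datasets import make_confounded_irm_data

np.random.seed(42)
res = make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False)
x, y, d = res['x'], res['y'], res['d']
oracle = res['oracle_values']
# in-sample confounding strengths implied by gamma_a and beta_a
print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])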

19 changes: 15 additions & 4 deletions doubleml/tests/test_datasets.py
@@ -186,10 +186,16 @@ def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
_ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type='matrix')


@pytest.fixture(scope='function',
params=[True, False])
def linear(request):
return request.param


@pytest.mark.ci
def test_make_confounded_irm_data_return_types():
def test_make_confounded_irm_data_return_types(linear):
np.random.seed(3141)
res = make_confounded_irm_data()
res = make_confounded_irm_data(linear=linear)
assert isinstance(res, dict)
assert isinstance(res['x'], np.ndarray)
assert isinstance(res['y'], np.ndarray)
@@ -203,9 +209,14 @@ def test_make_confounded_irm_data_return_types():
assert isinstance(res['oracle_values']['gamma_a'], float)
assert isinstance(res['oracle_values']['beta_a'], float)
assert isinstance(res['oracle_values']['a'], np.ndarray)
assert isinstance(res['oracle_values']['y0'], np.ndarray)
assert isinstance(res['oracle_values']['y1'], np.ndarray)
assert isinstance(res['oracle_values']['y_0'], np.ndarray)
assert isinstance(res['oracle_values']['y_1'], np.ndarray)
assert isinstance(res['oracle_values']['z'], np.ndarray)
assert isinstance(res['oracle_values']['cf_y'], float)
assert isinstance(res['oracle_values']['cf_d_ate'], float)
assert isinstance(res['oracle_values']['cf_d_atte'], float)
assert isinstance(res['oracle_values']['rho_ate'], float)
assert isinstance(res['oracle_values']['rho_atte'], float)


@pytest.mark.ci
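Beyond these type checks, a quick consistency sketch one might add, assuming the oracle keys asserted above: it recomputes the in-sample ``cf_y`` from its definition using the returned long and short regressions.

import numpy as np
from doubleml.datasets import make_confounded_irm_data

np.random.seed(3141)
res = make_confounded_irm_data(n_obs=10000)
ov = res['oracle_values']
# cf_y = mean squared long/short difference over mean squared residual of the short form
cf_y_manual = np.mean((ov['g_long'] - ov['g_short'])**2) / np.mean((res['y'] - ov['g_short'])**2)
assert np.isclose(cf_y_manual, ov['cf_y'])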