190 changes: 121 additions & 69 deletions doubleml/datasets.py
@@ -1,5 +1,6 @@
import pandas as pd
import numpy as np
import warnings

from scipy.linalg import toeplitz
from scipy.optimize import minimize_scalar
@@ -895,11 +896,11 @@ def f_ps(w, xi):
raise ValueError('Invalid return_type.')


def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs):
"""
Generates confounded data from an interactive regression model.

The data generating process is defined as follows (similar to the Monte Carlo simulation used
The data generating process is defined as follows (inspired by the Monte Carlo simulation used
in Sant'Anna and Zhao (2020)).

Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
@@ -924,22 +925,30 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):

.. math::

m(X, A) = P(D=1|X,A) = 0.5 + \\gamma_A \\cdot A
m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A

where

.. math::

p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))},

f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4).

and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`.
Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as

.. math::

P(D=1|X) = 0.5.
P(D=1|X) = p(Z).

Further, generate the outcome of interest :math:`Y` as

.. math::

Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon

g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4)
g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4)

where :math:`\\varepsilon \\sim \\mathcal{N}(0, \\sigma^2_{\\varepsilon})` with error variance defaulting to ``var_eps_y=1.0``.
This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
@@ -952,13 +961,13 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
\\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))})
\\cdot D (Z_5 + 1) + g(Z).

Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample).
Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be
set via the parameters ``gamma_a`` and ``beta_a``.

The observed data is given as :math:`W = (Y, D, X)`.
The observed data is given as :math:`W = (Y, D, Z)`.
Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`,
the potential outcomes of :math:`Y`, the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the
long and short forms of the main regression and the propensity score
the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and
in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE)
are returned in a dictionary.

Parameters
Expand All @@ -968,13 +977,16 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
Default is ``500``.
theta : float or int
Average treatment effect.
Default is ``5.0``.
cf_y : float
Percentage of the residual variation of the outcome explained by latent/confounding variable.
Default is ``0.04``.
cf_d : float
Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable.
Default is ``0.04``.
Default is ``0.0``.
gamma_a : float
Coefficient of the unobserved confounder in the propensity score.
Default is ``0.127``.
beta_a : float
Coefficient of the unobserved confounder in the outcome regression.
Default is ``0.58``.
linear : bool
If ``True``, :math:`Z` will be set to :math:`X`, such that the underlying (short) models are linear/logistic.
Default is ``False``.

Returns
-------
@@ -988,82 +1000,122 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
"""
c = 0.0 # the confounding strength is only valid for c=0
dim_x = 5
xi = 0.75
dim_x = kwargs.get('dim_x', 5)
trimming_threshold = kwargs.get('trimming_threshold', 0.01)
var_eps_y = kwargs.get('var_eps_y', 1.0)

# Specification of main regression function
def f_reg(w):
res = 2.5 + 0.74*w[:, 0] + 0.25 * w[:, 1] + 0.137*(w[:, 2] + w[:, 3])
return res

# Specification of prop score function
def f_ps(w, xi):
res = xi*(-w[:, 0] + 0.1*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3])
return res
# observed covariates
cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])

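# nonlinear transformations of x into the observed covariates z (cf. Sant'Anna and Zhao, 2020)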
z_tilde_1 = np.exp(0.5*x[:, 0])
z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3
z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2

z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:]))
z_tilde_5 = x[:, 4]
z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5))
z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)

# error terms and unobserved confounder
var_eps_y = 5
eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs)

# unobserved confounder
a_bounds = (-1, 1)
a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs)
var_a = np.square(a_bounds[1] - a_bounds[0]) / 12

# get the required impact of the confounder on the propensity score
possible_coefs = np.arange(0.001, 0.4999, 0.001)
gamma_a = possible_coefs[(np.arctanh(2*possible_coefs) / (2*possible_coefs)) - 1 - cf_d/(1 - cf_d) >= 0][0]

# compute short and long form of riesz representer
m_long = 0.5 + gamma_a*a
m_short = 0.5 * np.ones_like(m_long)
# Choose the features used in the models
if linear:
features_ps = x
features_reg = x
else:
features_ps = z
features_reg = z

p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
# compute short and long form of propensity score
m_long = p + gamma_a*a
m_short = p
# check propensity score bounds
if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold):
m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold)
m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold)
warnings.warn(f'Propensity score is close to 0 or 1. '
f'Trimming at {trimming_threshold} and {1.0-trimming_threshold} is applied.')
# generate treatment based on long form
u = np.random.uniform(low=0, high=1, size=n_obs)
d = 1.0 * (m_long >= u)

# short and long version of g
g_partial_reg = 210 + 27.4*z[:, 0] + 13.7*(z[:, 1] + z[:, 2] + z[:, 3])

dx = d * (x[:, 4] + 1)
d1x = x[:, 4] + 1
var_dx = np.var(dx)
cov_adx = np.cov(a, dx)[0, 1]

def f_g(beta_a):
g_diff = beta_a * (a - cov_adx / var_dx)
y_diff = eps_y + g_diff
return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y)
beta_a = minimize_scalar(f_g).x

# add treatment heterogeneity
d1x = z[:, 4] + 1
var_dx = np.var(d*(d1x))
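# Cov(A, D*(Z_5+1)) = gamma_a * Var(A), since A is independent of Z with E[A] = 0 and E[Z_5] = 0 (ignoring trimming)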
cov_adx = gamma_a * var_a
# Outcome regression
g_partial_reg = f_reg(features_reg)
# short model
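# the short form absorbs beta_a * A through its linear projection onto D*(Z_5+1)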
g_short_d0 = g_partial_reg
g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg
g_short = d * g_short_d1 + (1.0-d) * g_short_d0

# long model
g_long_d0 = g_partial_reg + beta_a * a
g_long_d1 = theta * d1x + g_partial_reg + beta_a * a
g_long = d * g_long_d1 + (1.0-d) * g_long_d0

y0 = g_long_d0 + eps_y
y1 = g_long_d1 + eps_y

y = d * y1 + (1.0-d) * y0

oracle_values = {'g_long': g_long,
'g_short': g_short,
'm_long': m_long,
'm_short': m_short,
'gamma_a': gamma_a,
'beta_a': beta_a,
'a': a,
'y0': y0,
'y1': y1,
'z': z}

res_dict = {'x': x,
'y': y,
'd': d,
'oracle_values': oracle_values}

# Potential outcomes
y_0 = g_long_d0 + eps_y
y_1 = g_long_d1 + eps_y
# Realized outcome
y = d * y_1 + (1.0-d) * y_0
# In-sample values for confounding strength
explained_residual_variance = np.square(g_long - g_short)
residual_variance = np.square(y - g_short)
cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance)
# compute the Riesz representation
treated_weight = d / np.mean(d)
untreated_weight = (1.0 - d) / np.mean(d)
# Odds ratios
propensity_ratio_long = m_long / (1.0 - m_long)
rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long)
rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long)
propensity_ratio_short = m_short / (1.0 - m_short)
rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short)
rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short)
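# in-sample confounding strength of the propensity score: relative gains in the variation of the Riesz representer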
cf_d_ate = (np.mean(1/(m_long * (1 - m_long))) - np.mean(1/(m_short * (1 - m_short)))) / np.mean(1/(m_long * (1 - m_long)))
cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long)
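# correlation between the long/short differences of regression and Riesz representer (zero without confounding)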
if (beta_a == 0) | (gamma_a == 0):
rho_ate = 0.0
rho_atte = 0.0
else:
rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1]
rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1]
oracle_values = {
'g_long': g_long,
'g_short': g_short,
'm_long': m_long,
'm_short': m_short,
'gamma_a': gamma_a,
'beta_a': beta_a,
'a': a,
'y_0': y_0,
'y_1': y_1,
'z': z,
'cf_y': cf_y,
'cf_d_ate': cf_d_ate,
'cf_d_atte': cf_d_atte,
'rho_ate': rho_ate,
'rho_atte': rho_atte,
}
res_dict = {
'x': x,
'y': y,
'd': d,
'oracle_values': oracle_values
}
return res_dict

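A minimal usage sketch of the revised generator, assuming this branch of DoubleML is installed: the in-sample confounding strengths are read off the returned oracle values rather than targeted via the removed ``cf_y``/``cf_d`` arguments.

import numpy as np
from doubleml.datasets import make_confounded_irm_data

np.random.seed(42)
res = make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False)
x, y, d = res['x'], res['y'], res['d']
oracle = res['oracle_values']
# in-sample confounding strengths implied by gamma_a and beta_a
print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])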

19 changes: 15 additions & 4 deletions doubleml/tests/test_datasets.py
@@ -186,10 +186,16 @@ def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
_ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type='matrix')


@pytest.fixture(scope='function',
params=[True, False])
def linear(request):
return request.param


@pytest.mark.ci
def test_make_confounded_irm_data_return_types():
def test_make_confounded_irm_data_return_types(linear):
np.random.seed(3141)
res = make_confounded_irm_data()
res = make_confounded_irm_data(linear=linear)
assert isinstance(res, dict)
assert isinstance(res['x'], np.ndarray)
assert isinstance(res['y'], np.ndarray)
@@ -203,9 +209,14 @@ def test_make_confounded_irm_data_return_types():
assert isinstance(res['oracle_values']['gamma_a'], float)
assert isinstance(res['oracle_values']['beta_a'], float)
assert isinstance(res['oracle_values']['a'], np.ndarray)
assert isinstance(res['oracle_values']['y0'], np.ndarray)
assert isinstance(res['oracle_values']['y1'], np.ndarray)
assert isinstance(res['oracle_values']['y_0'], np.ndarray)
assert isinstance(res['oracle_values']['y_1'], np.ndarray)
assert isinstance(res['oracle_values']['z'], np.ndarray)
assert isinstance(res['oracle_values']['cf_y'], float)
assert isinstance(res['oracle_values']['cf_d_ate'], float)
assert isinstance(res['oracle_values']['cf_d_atte'], float)
assert isinstance(res['oracle_values']['rho_ate'], float)
assert isinstance(res['oracle_values']['rho_atte'], float)


@pytest.mark.ci
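Beyond these type checks, a quick consistency sketch one might add, assuming the oracle keys asserted above: it recomputes the in-sample ``cf_y`` from its definition using the returned long and short regressions.

import numpy as np
from doubleml.datasets import make_confounded_irm_data

np.random.seed(3141)
res = make_confounded_irm_data(n_obs=10000)
ov = res['oracle_values']
# cf_y = mean squared long/short difference over mean squared residual of the short form
cf_y_manual = np.mean((ov['g_long'] - ov['g_short'])**2) / np.mean((res['y'] - ov['g_short'])**2)
assert np.isclose(cf_y_manual, ov['cf_y'])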