diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index baa6d6250..3e5321ea7 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -23,12 +23,12 @@ body:
     attributes:
       label: Minimum reproducible code snippet
       description: |
         Please provide a short reproducible code snippet. Example:
 
         ```python
         import numpy as np
         import doubleml as dml
-        from doubleml.datasets import make_plr_CCDDHNR2018
+        from doubleml.plm.datasets import make_plr_CCDDHNR2018
         from sklearn.ensemble import RandomForestRegressor
         from sklearn.base import clone
         np.random.seed(3141)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4809c62a6..a614dd73b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,7 +15,7 @@ To submit a **bug report**, you can use our
 ```python
 import numpy as np
 import doubleml as dml
-from doubleml.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.base import clone
 np.random.seed(3141)
diff --git a/doubleml/data/base_data.py b/doubleml/data/base_data.py
index 318508e98..7a1142207 100644
--- a/doubleml/data/base_data.py
+++ b/doubleml/data/base_data.py
@@ -135,9 +135,9 @@ class DoubleMLData(DoubleMLBaseData):
         Default is ``True``.
 
     Examples
     --------
     >>> from doubleml import DoubleMLData
-    >>> from doubleml.datasets import make_plr_CCDDHNR2018
+    >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018
     >>> # initialization from pandas.DataFrame
     >>> df = make_plr_CCDDHNR2018(return_type='DataFrame')
     >>> obj_dml_data_from_df = DoubleMLData(df, 'y', 'd')
@@ -266,9 +266,9 @@ def from_arrays(
             Default is ``True``.
 
         Examples
        --------
         >>> from doubleml import DoubleMLData
-        >>> from doubleml.datasets import make_plr_CCDDHNR2018
+        >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018
         >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array')
         >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d)
         """
diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py
index 658ab0cca..89947b736 100644
--- a/doubleml/data/cluster_data.py
+++ b/doubleml/data/cluster_data.py
@@ -61,9 +61,9 @@ class DoubleMLClusterData(DoubleMLData):
         Default is ``True``.
 
     Examples
     --------
     >>> from doubleml import DoubleMLClusterData
-    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+    >>> from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021
     >>> # initialization from pandas.DataFrame
     >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
     >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
@@ -172,9 +172,9 @@ def from_arrays(
             Default is ``True``.
 
         Examples
         --------
         >>> from doubleml import DoubleMLClusterData
-        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+        >>> from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021
         >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
         >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
         """
diff --git a/doubleml/data/tests/conftest.py b/doubleml/data/tests/conftest.py
index 6960b58a7..fcefabce5 100644
--- a/doubleml/data/tests/conftest.py
+++ b/doubleml/data/tests/conftest.py
@@ -2,7 +2,8 @@
 import pandas as pd
 import pytest
 
-from doubleml.datasets import make_irm_data, make_plr_turrell2018
+from doubleml.irm.datasets import make_irm_data
+from doubleml.plm.datasets import make_plr_turrell2018
 
 
 @pytest.fixture(scope="session", params=[(500, 10), (1000, 20), (1000, 100)])
diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py
index e95dfa033..b02a32752 100644
--- a/doubleml/data/tests/test_cluster_data.py
+++ b/doubleml/data/tests/test_cluster_data.py
@@ -3,7 +3,7 @@
 import pytest
 
 from doubleml import DoubleMLClusterData
-from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
+from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
 
 
 @pytest.mark.ci
diff --git a/doubleml/data/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py
index 7cf394b54..a2ada74bb 100644
--- a/doubleml/data/tests/test_dml_data.py
+++ b/doubleml/data/tests/test_dml_data.py
@@ -5,12 +5,12 @@
 
 from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM
 from doubleml.data.base_data import DoubleMLBaseData
-from doubleml.datasets import (
+from doubleml.plm.datasets import (
     _make_pliv_data,
     make_pliv_CHS2015,
     make_plr_CCDDHNR2018,
-    make_ssm_data,
 )
+from doubleml.irm.datasets import make_ssm_data
 from doubleml.did.datasets import make_did_SZ2020
 
 
diff --git a/doubleml/datasets.py b/doubleml/datasets.py
deleted file mode 100644
index 0dcd33c7c..000000000
--- a/doubleml/datasets.py
+++ /dev/null
@@ -1,1620 +0,0 @@
-import warnings
-
-import numpy as np
-import pandas as pd
-from scipy.linalg import toeplitz
-from scipy.optimize import minimize_scalar
-from sklearn.datasets import make_spd_matrix
-from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
-
-from doubleml.data import DoubleMLClusterData, DoubleMLData
-from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_cluster_data_alias, _get_dml_data_alias
-
-_array_alias = _get_array_alias()
-_data_frame_alias = _get_data_frame_alias()
-_dml_data_alias = _get_dml_data_alias()
-_dml_cluster_data_alias = _get_dml_cluster_data_alias()
-
-
-def fetch_401K(return_type="DoubleMLData", polynomial_features=False):
-    """
-    Data set on financial wealth and 401(k) plan participation.
-
-    Parameters
-    ----------
-    return_type :
-        If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
-
-        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
-    polynomial_features :
-        If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)).
-
-    References
-    ----------
-    Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of
-    Econometrics, 113(2): 231-263.
- - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - url = "https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta" - raw_data = pd.read_stata(url) - - y_col = "net_tfa" - d_cols = ["e401"] - x_cols = ["age", "inc", "educ", "fsize", "marr", "twoearn", "db", "pira", "hown"] - - data = raw_data.copy() - - if polynomial_features: - raise NotImplementedError("polynomial_features os not implemented yet for fetch_401K.") - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError("Invalid return_type.") - - -def fetch_bonus(return_type="DoubleMLData", polynomial_features=False): - """ - Data set on the Pennsylvania Reemployment Bonus experiment. - - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Bilias Y. (2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. - Journal of Applied Econometrics, 15(6): 575-594. - - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat" - raw_data = pd.read_csv(url, sep=r"\s+") - - ind = (raw_data["tg"] == 0) | (raw_data["tg"] == 4) - data = raw_data.copy()[ind] - data.reset_index(inplace=True) - data["tg"] = data["tg"].replace(4, 1) - data["inuidur1"] = np.log(data["inuidur1"]) - - # variable dep as factor (dummy encoding) - dummy_enc = OneHotEncoder(drop="first", categories="auto").fit(data.loc[:, ["dep"]]) - xx = dummy_enc.transform(data.loc[:, ["dep"]]).toarray() - data["dep1"] = xx[:, 0] - data["dep2"] = xx[:, 1] - - y_col = "inuidur1" - d_cols = ["tg"] - x_cols = [ - "female", - "black", - "othrace", - "dep1", - "dep2", - "q2", - "q3", - "q4", - "q5", - "q6", - "agelt35", - "agegt54", - "durable", - "lusd", - "husd", - ] - - if polynomial_features: - poly = PolynomialFeatures(2, include_bias=False) - data_transf = poly.fit_transform(data[x_cols]) - x_cols = list(poly.get_feature_names_out(x_cols)) - - data_transf = pd.DataFrame(data_transf, columns=x_cols) - data = pd.concat((data[[y_col] + d_cols], data_transf), axis=1, sort=False) - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError("Invalid return_type.") - - -def _g(x): - return np.power(np.sin(x), 2) - - -def _m(x, nu=0.0, gamma=1.0): - return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) - - -def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", **kwargs): - """ - Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. - The data generating process is defined as - - .. 
math:: - - d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. - The nuisance functions are given by - - .. math:: - - m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, - - g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - alpha : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. - - References - ---------- - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - a_0 = kwargs.get("a_0", 1.0) - a_1 = kwargs.get("a_1", 0.25) - s_1 = kwargs.get("s_1", 1.0) - - b_0 = kwargs.get("b_0", 1.0) - b_1 = kwargs.get("b_1", 0.25) - s_2 = kwargs.get("s_2", 1.0) - - cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - d = ( - a_0 * x[:, 0] - + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) - + s_1 - * np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - y = ( - alpha * d - + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) - + b_1 * x[:, 2] - + s_2 - * np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type="DoubleMLData", **kwargs): - """ - Generates data from a partially linear regression model used in a blog article by Turrell (2018). - The data generating process is defined as - - .. math:: - - d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, - positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. - :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by - - .. math:: - - m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, - - g_0(x_i) &= \\sin(x_i)^2. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. 
- return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\nu=0`, or :math:`\\gamma=1`. - - References - ---------- - Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, - science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ - `_. - """ - nu = kwargs.get("nu", 0.0) - gamma = kwargs.get("gamma", 1.0) - - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal( - np.zeros(dim_x), - sigma, - size=[ - n_obs, - ], - ) - G = _g(np.dot(x, b)) - M = _m(np.dot(x, b), nu=nu, gamma=gamma) - d = M + np.random.standard_normal( - size=[ - n_obs, - ] - ) - y = ( - np.dot(theta, d) - + G - + np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type="DoubleMLData"): - """ - Generates data from a interactive regression (IRM) model. - The data generating process is defined as - - .. math:: - - d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i - \\sim \\mathcal{U}(0,1), - - y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and - :math:`c_d` are given by - - .. math:: - - c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = - \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. - - The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni - et al. (2017). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - R2_d : - The value of the parameter :math:`R_d^2`. - R2_y : - The value of the parameter :math:`R_y^2`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - - References - ---------- - Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With - High‐Dimensional Data. Econometrica, 85: 233-298. 
- """ - # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement - v = np.random.uniform( - size=[ - n_obs, - ] - ) - zeta = np.random.standard_normal( - size=[ - n_obs, - ] - ) - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - beta = [1 / (k**2) for k in range(1, dim_x + 1)] - b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) - c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) - c_d = np.sqrt(np.pi**2 / 3.0 * R2_d / ((1 - R2_d) * b_sigma_b)) - - xx = np.exp(np.dot(x, np.multiply(beta, c_d))) - d = 1.0 * ((xx / (1 + xx)) > v) - - y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_iivm_data(n_obs=500, dim_x=20, theta=1.0, alpha_x=0.2, return_type="DoubleMLData"): - """ - Generates data from a interactive IV regression (IIVM) model. - The data generating process is defined as - - .. math:: - - d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, - - y_i &= \\theta d_i + x_i' \\beta + u_i, - - with :math:`Z \\sim \\text{Bernoulli}(0.5)` and - - .. math:: - - \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). - - The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries - :math:`\\beta_j=\\frac{1}{j^2}`. - - The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and - Klaassen (2020). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - alpha_x : - The value of the parameter :math:`\\alpha_x`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion - Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. 
- """ - # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 - xx = np.random.multivariate_normal( - np.zeros(2), - np.array([[1.0, 0.3], [0.3, 1.0]]), - size=[ - n_obs, - ], - ) - u = xx[:, 0] - v = xx[:, 1] - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - beta = [1 / (k**2) for k in range(1, dim_x + 1)] - - z = np.random.binomial( - p=0.5, - n=1, - size=[ - n_obs, - ], - ) - d = 1.0 * (alpha_x * z + v > 0) - - y = d * theta + np.dot(x, beta) + u - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols, "z") - else: - raise ValueError("Invalid return_type.") - - -def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type="DoubleMLData"): - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal( - np.zeros(dim_x), - sigma, - size=[ - n_obs, - ], - ) - G = _g(np.dot(x, b)) - # instrument - z = _m(np.dot(x, b)) + np.random.standard_normal( - size=[ - n_obs, - ] - ) - # treatment - M = _m(gamma_z * z + np.dot(x, b)) - d = M + np.random.standard_normal( - size=[ - n_obs, - ] - ) - y = ( - np.dot(theta, d) - + G - + np.random.standard_normal( - size=[ - n_obs, - ] - ) - ) - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols, "z") - else: - raise ValueError("Invalid return_type.") - - -def make_pliv_CHS2015(n_obs, alpha=1.0, dim_x=200, dim_z=150, return_type="DoubleMLData"): - """ - Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). - The data generating process is defined as - - .. math:: - - z_i &= \\Pi x_i + \\zeta_i, - - d_i &= x_i' \\gamma + z_i' \\delta + u_i, - - y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, - - with - - .. math:: - - \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ - 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) - - where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. - :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, - :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` - and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - alpha : - The value of the causal parameter. - dim_x : - The number of covariates. - dim_z : - The number of instruments. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear - Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. - """ - assert dim_x >= dim_z - # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf - xx = np.random.multivariate_normal( - np.zeros(2), - np.array([[1.0, 0.6], [0.6, 1.0]]), - size=[ - n_obs, - ], - ) - epsilon = xx[:, 0] - u = xx[:, 1] - - sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - sigma, - size=[ - n_obs, - ], - ) - - I_z = np.eye(dim_z) - xi = np.random.multivariate_normal( - np.zeros(dim_z), - 0.25 * I_z, - size=[ - n_obs, - ], - ) - - beta = [1 / (k**2) for k in range(1, dim_x + 1)] - gamma = beta - delta = [1 / (k**2) for k in range(1, dim_z + 1)] - Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) - - z = np.dot(x, np.transpose(Pi)) + xi - d = np.dot(x, gamma) + np.dot(z, delta) + u - y = alpha * d + np.dot(x, beta) + epsilon - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - z_cols = [f"Z{i + 1}" for i in np.arange(dim_z)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d"] + z_cols) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", x_cols, z_cols) - else: - raise ValueError("Invalid return_type.") - - -def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLClusterData", **kwargs): - """ - Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. - (2021). The data generating process is defined as - - .. math:: - - Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, - - D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, - - Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, - - with - - .. math:: - - X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X - + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, - - \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon - + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, - - v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v - + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, - - V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V - + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, - - and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` - where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries - :math:`\\Sigma_{kj} = s_X^{|j-k|}`. - Further - - .. 
math:: - - \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) - \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ - s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) - - - and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. - - Parameters - ---------- - N : - The number of observations (first dimension). - M : - The number of observations (second dimension). - dim_X : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where - ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s - ``(x, y, d, cluster_vars, z)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, - :math:`s_X = s_{\\varepsilon v} = 0.25`, - or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries - :math:`(\\zeta_{0})_j = 0.5^j`. - - References - ---------- - Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, - Journal of Business & Economic Statistics, - doi: `10.1080/07350015.2021.1895815 `_, - arXiv:`1909.03489 `_. - """ - # additional parameters specifiable via kwargs - pi_10 = kwargs.get("pi_10", 1.0) - - xx = np.arange(1, dim_X + 1) - zeta_0 = kwargs.get("zeta_0", np.power(0.5, xx)) - pi_20 = kwargs.get("pi_20", np.power(0.5, xx)) - xi_0 = kwargs.get("xi_0", np.power(0.5, xx)) - - omega_X = kwargs.get("omega_X", np.array([0.25, 0.25])) - omega_epsilon = kwargs.get("omega_epsilon", np.array([0.25, 0.25])) - omega_v = kwargs.get("omega_v", np.array([0.25, 0.25])) - omega_V = kwargs.get("omega_V", np.array([0.25, 0.25])) - - s_X = kwargs.get("s_X", 0.25) - s_epsilon_v = kwargs.get("s_epsilon_v", 0.25) - - # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., - # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] - # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] - - alpha_V = np.random.normal(size=(N * M)) - alpha_V_i = np.repeat(np.random.normal(size=N), M) - alpha_V_j = np.tile(np.random.normal(size=M), N) - - cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) - alpha_eps_v = np.random.multivariate_normal( - np.zeros(2), - cov_mat, - size=[ - N * M, - ], - ) - alpha_eps = alpha_eps_v[:, 0] - alpha_v = alpha_eps_v[:, 1] - - alpha_eps_v_i = np.random.multivariate_normal( - np.zeros(2), - cov_mat, - size=[ - N, - ], - ) - alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) - alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) - - alpha_eps_v_j = np.random.multivariate_normal( - np.zeros(2), - cov_mat, - size=[ - M, - ], - ) - alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) - alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) - - cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) - alpha_X = np.random.multivariate_normal( - np.zeros(dim_X), - cov_mat, - size=[ - N * M, - ], - ) - alpha_X_i = np.repeat( - np.random.multivariate_normal( - np.zeros(dim_X), - 
cov_mat, - size=[ - N, - ], - ), - M, - axis=0, - ) - alpha_X_j = np.tile( - np.random.multivariate_normal( - np.zeros(dim_X), - cov_mat, - size=[ - M, - ], - ), - (N, 1), - ) - - # generate variables - x = (1 - omega_X[0] - omega_X[1]) * alpha_X + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j - - eps = ( - (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j - ) - - v = (1 - omega_v[0] - omega_v[1]) * alpha_v + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j - - V = (1 - omega_V[0] - omega_V[1]) * alpha_V + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j - - z = np.matmul(x, xi_0) + V - d = z * pi_10 + np.matmul(x, pi_20) + v - y = d * theta + np.matmul(x, zeta_0) + eps - - cluster_cols = ["cluster_var_i", "cluster_var_j"] - cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) - - if return_type in _array_alias: - return x, y, d, cluster_vars.values, z - elif return_type in _data_frame_alias + _dml_cluster_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_X)] - data = pd.concat((cluster_vars, pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["Y", "D", "Z"])), axis=1) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z") - else: - raise ValueError("Invalid return_type.") - - -def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): - """ - Generates counfounded data from an interactive regression model. - - The data generating process is defined as follows (inspired by the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the propensity score as - - .. math:: - - m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A - - where - - .. math:: - - p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, - - f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). - - and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. - Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as - - .. math:: - - P(D=1|X) = p(Z). - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) - \\cdot D (Z_5 + 1) + g(Z). 
- - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be - set via the parameters ``gamma_a`` and ``beta_a``. - - The observed data is given as :math:`W = (Y, D, Z)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, - the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and - in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) - are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``0.0``. - gamma_a : float - Coefficient of the unobserved confounder in the propensity score. - Default is ``0.127``. - beta_a : float - Coefficient of the unobserved confounder in the outcome regression. - Default is ``0.58``. - linear : bool - If ``True``, the Z will be set to X, such that the underlying (short) models are linear/logistic. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. - """ - c = 0.0 # the confounding strength is only valid for c=0 - xi = 0.75 - dim_x = kwargs.get("dim_x", 5) - trimming_threshold = kwargs.get("trimming_threshold", 0.01) - var_eps_y = kwargs.get("var_eps_y", 1.0) - - # Specification of main regression function - def f_reg(w): - res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) - return res - - # Specification of prop score function - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - z_tilde_5 = x[:, 4] - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - # error terms and unobserved confounder - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # Choose the features used in the models - if linear: - features_ps = x - features_reg = x - else: - features_ps = z - features_reg = z - - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - # compute short and long form of propensity score - m_long = p + gamma_a * a - m_short = p - # check propensity score bounds - if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): - m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) - m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) - warnings.warn( - f"Propensity score is close to 0 or 1. 
" - f"Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied" - ) - # generate treatment based on long form - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (m_long >= u) - # add treatment heterogeneity - d1x = z[:, 4] + 1 - var_dx = np.var(d * (d1x)) - cov_adx = gamma_a * var_a - # Outcome regression - g_partial_reg = f_reg(features_reg) - # short model - g_short_d0 = g_partial_reg - g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg - g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 - # long model - g_long_d0 = g_partial_reg + beta_a * a - g_long_d1 = theta * d1x + g_partial_reg + beta_a * a - g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 - # Potential outcomes - y_0 = g_long_d0 + eps_y - y_1 = g_long_d1 + eps_y - # Realized outcome - y = d * y_1 + (1.0 - d) * y_0 - # In-sample values for confounding strength - explained_residual_variance = np.square(g_long - g_short) - residual_variance = np.square(y - g_short) - cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) - # compute the Riesz representation - treated_weight = d / np.mean(d) - untreated_weight = (1.0 - d) / np.mean(d) - # Odds ratios - propensity_ratio_long = m_long / (1.0 - m_long) - rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) - rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) - propensity_ratio_short = m_short / (1.0 - m_short) - rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short) - rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) - cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( - 1 / (m_long * (1 - m_long)) - ) - cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) - if (beta_a == 0) | (gamma_a == 0): - rho_ate = 0.0 - rho_atte = 0.0 - else: - rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] - rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] - oracle_values = { - "g_long": g_long, - "g_short": g_short, - "m_long": m_long, - "m_short": m_short, - "gamma_a": gamma_a, - "beta_a": beta_a, - "a": a, - "y_0": y_0, - "y_1": y_1, - "z": z, - "cf_y": cf_y, - "cf_d_ate": cf_d_ate, - "cf_d_atte": cf_d_atte, - "rho_ate": rho_ate, - "rho_atte": rho_atte, - } - res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} - return res_dict - - -def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): - """ - Generates counfounded data from an partially linear regression model. - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, - where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the treatment as - - .. 
math:: - - D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D - - and with :math:`\\varepsilon \\sim \\mathcal{N}(0,1)`. - Since :math:`A` is independent of :math:`X`, the long and short form of the treatment regression are given as - - .. math:: - - E[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A - - E[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). - - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. - Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample). - - The observed data is given as :math:`W = (Y, D, X)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, the effect :math:`\\theta`, - the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and - the propensity score are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``5.0``. - cf_y : float - Percentage of the residual variation of the outcome explained by latent/confounding variable. - Default is ``0.04``. - cf_d : float - Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable. - Default is ``0.04``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
- """ - c = kwargs.get("c", 0.0) - dim_x = kwargs.get("dim_x", 4) - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # get the required impact of the confounder on the propensity score - m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] - - def f_m(gamma_a): - rr_long = eps_d / var_eps_d - rr_short = (gamma_a * a + eps_d) / (gamma_a**2 * var_a + var_eps_d) - C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) - return np.square(C2_D / (1 + C2_D) - cf_d) - - gamma_a = minimize_scalar(f_m).x - m_long = m_short + gamma_a * a - d = m_long + eps_d - - # short and long version of g - g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3]) - - var_d = np.var(d) - - def f_g(beta_a): - g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) - y_diff = eps_y + g_diff - return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) - - beta_a = minimize_scalar(f_g).x - - g_long = theta * d + g_partial_reg + beta_a * a - g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg - - y = g_long + eps_y - - oracle_values = { - "g_long": g_long, - "g_short": g_short, - "m_long": m_long, - "m_short": m_short, - "theta": theta, - "gamma_a": gamma_a, - "beta_a": beta_a, - "a": a, - "z": z, - } - - res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} - - return res_dict - - -def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): - """ - Creates a simple synthetic example for heterogeneous treatment effects. - The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). - - The data is generated as - - .. math:: - - Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i - - D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, - - where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i - \\sim\\mathcal{U}[-1,1]`. - If the treatment is set to be binary, the treatment is generated as - - .. math:: - D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. - - The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support - which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. - Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending - on the dimension of :math:`x`. - - If the heterogeneity is univariate the conditional treatment effect takes the following form - - .. math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), - - whereas for the two-dimensional case the conditional treatment effect is defined as - - .. 
math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). - - Parameters - ---------- - n_obs : int - Number of observations to simulate. - Default is ``200``. - - p : int - Dimension of covariates. - Default is ``30``. - - support_size : int - Number of relevant (confounding) covariates. - Default is ``5``. - - n_x : int - Dimension of the heterogeneity. Can be either ``1`` or ``2``. - Default is ``1``. - - binary_treatment : bool - Indicates whether the treatment is binary. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``data``, ``effects``, ``treatment_effect``. - - """ - # simple input checks - assert n_x in [1, 2], "n_x must be either 1 or 2." - assert support_size <= p, "support_size must be smaller than p." - assert isinstance(binary_treatment, bool), "binary_treatment must be a boolean." - - # define treatment effects - if n_x == 1: - - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) - - else: - assert n_x == 2 - - # redefine treatment effect - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) - - # Outcome support and coefficients - support_y = np.random.choice(np.arange(p), size=support_size, replace=False) - coefs_y = np.random.uniform(0, 1, size=support_size) - # treatment support and coefficients - support_d = support_y - coefs_d = np.random.uniform(0, 0.3, size=support_size) - - # noise - epsilon = np.random.uniform(-1, 1, size=n_obs) - eta = np.random.uniform(-1, 1, size=n_obs) - - # Generate controls, covariates, treatments and outcomes - x = np.random.uniform(0, 1, size=(n_obs, p)) - # Heterogeneous treatment effects - te = treatment_effect(x) - if binary_treatment: - d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) - else: - d = np.dot(x[:, support_d], coefs_d) + eta - y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon - - # Now we build the dataset - y_df = pd.DataFrame({"y": y}) - d_df = pd.DataFrame({"d": d}) - x_df = pd.DataFrame(data=x, index=np.arange(x.shape[0]), columns=[f"X_{i}" for i in range(x.shape[1])]) - - data = pd.concat([y_df, d_df, x_df], axis=1) - res_dict = {"data": data, "effects": te, "treatment_effect": treatment_effect} - return res_dict - - -def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleMLData"): - """ - Generates data from a sample selection model (SSM). - The data generating process is defined as - - .. math:: - - y_i &= \\theta d_i + x_i' \\beta d_i + u_i, - - s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, - - d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, - - with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where - :math:`\\Sigma^2_x` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` - :math:`z_i \\sim \\mathcal{N}(0, 1)`, - :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, - :math:`w_i \\sim \\mathcal{N}(0, 1)`. - - - The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, - Huber and Lafférs (2023). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - mar: - Boolean. Indicates whether missingness at random holds. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. - - References - ---------- - Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, - Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 - """ - if mar: - sigma = np.array([[1, 0], [0, 1]]) - gamma = 0 - else: - sigma = np.array([[1, 0.8], [0.8, 1]]) - gamma = 1 - - e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - beta = [0.4 / (k**2) for k in range(1, dim_x + 1)] - - d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) - z = np.random.randn(n_obs) - s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) - - y = np.dot(x, beta) + theta * d + e[1] - y[s == 0] = 0 - - if return_type in _array_alias: - return x, y, d, z, s - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] - if mar: - data = pd.DataFrame(np.column_stack((x, y, d, s)), columns=x_cols + ["y", "d", "s"]) - else: - data = pd.DataFrame(np.column_stack((x, y, d, z, s)), columns=x_cols + ["y", "d", "z", "s"]) - if return_type in _data_frame_alias: - return data - else: - if mar: - return DoubleMLData(data, "y", "d", x_cols, None, None, "s") - return DoubleMLData(data, "y", "d", x_cols, "z", None, "s") - else: - raise ValueError("Invalid return_type.") - - -def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): - """ - Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an - underlying continous treatment). - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - A continuous treatment :math:`D_{\\text{cont}}` is generated as - - .. math:: - - D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, - - where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment - effect is defined as - - .. math:: - - \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. - - Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of - :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels - is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. - - The potential outcomes are defined as - - .. math:: - - Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y - - Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), - - where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as - - .. 
math:: - - Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. - - The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``200``. - - n_levels : int - The number of treatment levels. - Default is ``3``. - - linear : bool - Indicates whether the true underlying regression is linear. - Default is ``False``. - - random_state : int - Random seed for reproducibility. - Default is ``42``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - The oracle values contain the continuous treatment, the level bounds, the potential level, ITE - and the potential outcome without treatment. - - """ - if random_state is not None: - np.random.seed(random_state) - xi = kwargs.get("xi", 0.3) - c = kwargs.get("c", 0.0) - dim_x = kwargs.get("dim_x", 5) - - if not isinstance(n_levels, int): - raise ValueError("n_levels must be an integer.") - if n_levels < 2: - raise ValueError("n_levels must be at least 2.") - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_treatment(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - def treatment_effect(d, scale=15): - return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - if linear: - g = f_reg(x) - m = f_treatment(x, xi) - else: - assert not linear - g = f_reg(z) - m = f_treatment(z, xi) - - cont_d = m + eps_d - level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) - potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 - eta = np.random.uniform(0, 1, size=n_obs) - d = 1.0 * (eta >= 1 / n_levels) * potential_level - - ite = treatment_effect(cont_d) - y0 = g + eps_y - # only treated for d > 0 compared to the baseline - y = ite * (d > 0) + y0 - - oracle_values = { - "cont_d": cont_d, - "level_bounds": level_bounds, - "potential_level": potential_level, - "ite": ite, - "y0": y0, - } - - resul_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} - - return resul_dict diff --git a/doubleml/datasets/__init__.py b/doubleml/datasets/__init__.py new file mode 100644 index 000000000..6a64a5c8d --- /dev/null +++ b/doubleml/datasets/__init__.py @@ -0,0 +1,13 @@ +""" +The :mod:`doubleml.datasets` module implements data generating processes for double machine learning simulations and provides access to real datasets. 
+""" + +# Import fetch functions +from .fetch_401K import fetch_401K +from .fetch_bonus import fetch_bonus + + +__all__ = [ + "fetch_401K", + "fetch_bonus", +] diff --git a/doubleml/datasets/fetch_401K.py b/doubleml/datasets/fetch_401K.py new file mode 100644 index 000000000..05a97fe73 --- /dev/null +++ b/doubleml/datasets/fetch_401K.py @@ -0,0 +1,65 @@ +""" +Data set on financial wealth and 401(k) plan participation. +""" + +import pandas as pd +from doubleml import DoubleMLData + + +def _get_array_alias(): + return ["array", "np.array", "np.ndarray"] + + +def _get_data_frame_alias(): + return ["DataFrame", "pd.DataFrame", "pandas.DataFrame"] + + +def _get_dml_data_alias(): + return ["DoubleMLData"] + + +def fetch_401K(return_type="DoubleMLData", polynomial_features=False): + """ + Data set on financial wealth and 401(k) plan participation. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of + Econometrics, 113(2): 231-263. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + _array_alias = _get_array_alias() + _data_frame_alias = _get_data_frame_alias() + _dml_data_alias = _get_dml_data_alias() + + url = "https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta" + raw_data = pd.read_stata(url) + + y_col = "net_tfa" + d_cols = ["e401"] + x_cols = ["age", "inc", "educ", "fsize", "marr", "twoearn", "db", "pira", "hown"] + + data = raw_data.copy() + + if polynomial_features: + raise NotImplementedError("polynomial_features os not implemented yet for fetch_401K.") + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/datasets/fetch_bonus.py b/doubleml/datasets/fetch_bonus.py new file mode 100644 index 000000000..155100c31 --- /dev/null +++ b/doubleml/datasets/fetch_bonus.py @@ -0,0 +1,98 @@ +""" +Data set on the Pennsylvania Reemployment Bonus experiment. +""" + +import numpy as np +import pandas as pd +from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures +from doubleml import DoubleMLData + + +def _get_array_alias(): + return ["array", "np.array", "np.ndarray"] + + +def _get_data_frame_alias(): + return ["DataFrame", "pd.DataFrame", "pandas.DataFrame"] + + +def _get_dml_data_alias(): + return ["DoubleMLData"] + + +def fetch_bonus(return_type="DoubleMLData", polynomial_features=False): + """ + Data set on the Pennsylvania Reemployment Bonus experiment. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Bilias Y. 
(2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. + Journal of Applied Econometrics, 15(6): 575-594. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 <https://doi.org/10.1111/ectj.12097>`_. + """ + _array_alias = _get_array_alias() + _data_frame_alias = _get_data_frame_alias() + _dml_data_alias = _get_dml_data_alias() + + url = "https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat" + raw_data = pd.read_csv(url, sep=r"\s+") + + ind = (raw_data["tg"] == 0) | (raw_data["tg"] == 4) + data = raw_data.copy()[ind] + data.reset_index(inplace=True) + data["tg"] = data["tg"].replace(4, 1) + data["inuidur1"] = np.log(data["inuidur1"]) + + # variable dep as factor (dummy encoding) + dummy_enc = OneHotEncoder(drop="first", categories="auto").fit(data.loc[:, ["dep"]]) + xx = dummy_enc.transform(data.loc[:, ["dep"]]).toarray() + data["dep1"] = xx[:, 0] + data["dep2"] = xx[:, 1] + + y_col = "inuidur1" + d_cols = ["tg"] + x_cols = [ + "female", + "black", + "othrace", + "dep1", + "dep2", + "q2", + "q3", + "q4", + "q5", + "q6", + "agelt35", + "agegt54", + "durable", + "lusd", + "husd", + ] + + if polynomial_features: + poly = PolynomialFeatures(2, include_bias=False) + data_transf = poly.fit_transform(data[x_cols]) + x_cols = list(poly.get_feature_names_out(x_cols)) + + data_transf = pd.DataFrame(data_transf, columns=x_cols) + data = pd.concat((data[[y_col] + d_cols], data_transf), axis=1, sort=False) + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 7a671993c..170535ea4 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -63,10 +63,9 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Default is ``True``. Examples - -------- - >>> import numpy as np + -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_did_SZ2020 + >>> from doubleml.did.datasets import make_did_SZ2020 >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(42) >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index ab2af5b95..7f33210fe 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -63,10 +63,9 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Default is ``True``.
Examples - -------- - >>> import numpy as np + -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_did_SZ2020 + >>> from doubleml.did.datasets import make_did_SZ2020 >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(42) >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5) diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 764865a43..fe4cec5de 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -1167,10 +1167,9 @@ def evaluate_learners(self, learners=None, metric=_rmse): Examples -------- - >>> import numpy as np - >>> import doubleml as dml + >>> import numpy as np >>> import doubleml as dml >>> from sklearn.metrics import mean_absolute_error - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) @@ -1284,10 +1283,9 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): self : object Examples - -------- - >>> import numpy as np + -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index 8099342aa..2960e90d3 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -673,7 +673,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index d2aeaced6..57347dce9 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -82,7 +82,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) diff --git a/doubleml/irm/datasets/__init__.py b/doubleml/irm/datasets/__init__.py new file mode 100644 index 000000000..05f951343 --- /dev/null +++ b/doubleml/irm/datasets/__init__.py @@ -0,0 +1,20 @@ +""" +The :mod:`doubleml.irm.datasets` module implements data generating processes for interactive regression models. 
+""" + +from .dgp_confounded_irm_data import make_confounded_irm_data +from .dgp_heterogeneous_data import make_heterogeneous_data +from .dgp_iivm_data import make_iivm_data +from .dgp_irm_data import make_irm_data +from .dgp_irm_data_discrete_treatments import make_irm_data_discrete_treatments +from .dgp_ssm_data import make_ssm_data + + +__all__ = [ + "make_confounded_irm_data", + "make_heterogeneous_data", + "make_iivm_data", + "make_irm_data", + "make_irm_data_discrete_treatments", + "make_ssm_data", +] diff --git a/doubleml/irm/datasets/dgp_confounded_irm_data.py b/doubleml/irm/datasets/dgp_confounded_irm_data.py new file mode 100644 index 000000000..2452e896b --- /dev/null +++ b/doubleml/irm/datasets/dgp_confounded_irm_data.py @@ -0,0 +1,232 @@ +import numpy as np +import warnings +from scipy.linalg import toeplitz + + +def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): + """ + Generates counfounded data from an interactive regression model. + + The data generating process is defined as follows (inspired by the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). + + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + At first, define the propensity score as + + .. math:: + + m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A + + where + + .. math:: + + p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, + + f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). + + and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. + Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as + + .. math:: + + P(D=1|X) = p(Z). + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. + This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) + \\cdot D (Z_5 + 1) + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be + set via the parameters ``gamma_a`` and ``beta_a``. + + The observed data is given as :math:`W = (Y, D, Z)`. + Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, + the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and + in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) + are returned in a dictionary. 
+ + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``0.0``. + gamma_a : float + Coefficient of the unobserved confounder in the propensity score. + Default is ``0.127``. + beta_a : float + Coefficient of the unobserved confounder in the outcome regression. + Default is ``0.58``. + linear : bool + If ``True``, ``Z`` is set to ``X``, so that the underlying (short) models are linear/logistic. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant'Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_. + """ + c = 0.0  # the confounding strength is only valid for c=0 + xi = 0.75 + dim_x = kwargs.get("dim_x", 5) + trimming_threshold = kwargs.get("trimming_threshold", 0.01) + var_eps_y = kwargs.get("var_eps_y", 1.0) + + # Specification of main regression function + def f_reg(w): + res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) + return res + + # Specification of prop score function + def f_ps(w, xi): + res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + z_tilde_5 = x[:, 4] + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + # error terms and unobserved confounder + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # Choose the features used in the models + if linear: + features_ps = x + features_reg = x + else: + features_ps = z + features_reg = z + + p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) + # compute short and long form of propensity score + m_long = p + gamma_a * a + m_short = p + # check propensity score bounds + if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): + m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) + m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) + warnings.warn( + f"Propensity score is close to 0 or 1.
" + f"Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied" + ) + # generate treatment based on long form + u = np.random.uniform(low=0, high=1, size=n_obs) + d = 1.0 * (m_long >= u) + # add treatment heterogeneity + d1x = z[:, 4] + 1 + var_dx = np.var(d * (d1x)) + cov_adx = gamma_a * var_a + # Outcome regression + g_partial_reg = f_reg(features_reg) + # short model + g_short_d0 = g_partial_reg + g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg + g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 + # long model + g_long_d0 = g_partial_reg + beta_a * a + g_long_d1 = theta * d1x + g_partial_reg + beta_a * a + g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 + # Potential outcomes + y_0 = g_long_d0 + eps_y + y_1 = g_long_d1 + eps_y + # Realized outcome + y = d * y_1 + (1.0 - d) * y_0 + # In-sample values for confounding strength + explained_residual_variance = np.square(g_long - g_short) + residual_variance = np.square(y - g_short) + cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) + # compute the Riesz representation + treated_weight = d / np.mean(d) + untreated_weight = (1.0 - d) / np.mean(d) + # Odds ratios + propensity_ratio_long = m_long / (1.0 - m_long) + rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) + rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) + propensity_ratio_short = m_short / (1.0 - m_short) + rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short) + rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) + cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( + 1 / (m_long * (1 - m_long)) + ) + cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) + if (beta_a == 0) | (gamma_a == 0): + rho_ate = 0.0 + rho_atte = 0.0 + else: + rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] + rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] + oracle_values = { + "g_long": g_long, + "g_short": g_short, + "m_long": m_long, + "m_short": m_short, + "gamma_a": gamma_a, + "beta_a": beta_a, + "a": a, + "y_0": y_0, + "y_1": y_1, + "z": z, + "cf_y": cf_y, + "cf_d_ate": cf_d_ate, + "cf_d_atte": cf_d_atte, + "rho_ate": rho_ate, + "rho_atte": rho_atte, + } + res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} + return res_dict diff --git a/doubleml/irm/datasets/dgp_heterogeneous_data.py b/doubleml/irm/datasets/dgp_heterogeneous_data.py new file mode 100644 index 000000000..0f1a1b15c --- /dev/null +++ b/doubleml/irm/datasets/dgp_heterogeneous_data.py @@ -0,0 +1,114 @@ +import numpy as np +import pandas as pd + + +def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): + """ + Creates a simple synthetic example for heterogeneous treatment effects. + The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). + + The data is generated as + + .. math:: + + Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i + + D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, + + where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i + \\sim\\mathcal{U}[-1,1]`. + If the treatment is set to be binary, the treatment is generated as + + .. math:: + D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. 
+ + The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support, + whose values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. + Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending + on the dimension of :math:`x`. + + If the heterogeneity is univariate, the conditional treatment effect takes the following form + + .. math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), + + whereas for the two-dimensional case the conditional treatment effect is defined as + + .. math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + Default is ``200``. + + p : int + Dimension of covariates. + Default is ``30``. + + support_size : int + Number of relevant (confounding) covariates. + Default is ``5``. + + n_x : int + Dimension of the heterogeneity. Can be either ``1`` or ``2``. + Default is ``1``. + + binary_treatment : bool + Indicates whether the treatment is binary. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``data``, ``effects``, ``treatment_effect``. + + """ + # simple input checks + assert n_x in [1, 2], "n_x must be either 1 or 2." + assert support_size <= p, "support_size must not exceed p." + assert isinstance(binary_treatment, bool), "binary_treatment must be a boolean." + + # define treatment effects + if n_x == 1: + + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) + + else: + assert n_x == 2 + + # redefine treatment effect + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) + + # Outcome support and coefficients + support_y = np.random.choice(np.arange(p), size=support_size, replace=False) + coefs_y = np.random.uniform(0, 1, size=support_size) + # treatment support and coefficients + support_d = support_y + coefs_d = np.random.uniform(0, 0.3, size=support_size) + + # noise + epsilon = np.random.uniform(-1, 1, size=n_obs) + eta = np.random.uniform(-1, 1, size=n_obs) + + # Generate controls, covariates, treatments and outcomes + x = np.random.uniform(0, 1, size=(n_obs, p)) + # Heterogeneous treatment effects + te = treatment_effect(x) + if binary_treatment: + d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) + else: + d = np.dot(x[:, support_d], coefs_d) + eta + y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon + + # Now we build the dataset + y_df = pd.DataFrame({"y": y}) + d_df = pd.DataFrame({"d": d}) + x_df = pd.DataFrame(data=x, index=np.arange(x.shape[0]), columns=[f"X_{i}" for i in range(x.shape[1])]) + + data = pd.concat([y_df, d_df, x_df], axis=1) + res_dict = {"data": data, "effects": te, "treatment_effect": treatment_effect} + return res_dict diff --git a/doubleml/irm/datasets/dgp_iivm_data.py b/doubleml/irm/datasets/dgp_iivm_data.py new file mode 100644 index 000000000..e8c1130f2 --- /dev/null +++ b/doubleml/irm/datasets/dgp_iivm_data.py @@ -0,0 +1,102 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_iivm_data(n_obs=500, dim_x=20, theta=1.0, alpha_x=0.2, return_type="DoubleMLData"): + """ + Generates data from an interactive IV regression
(IIVM) model. + The data generating process is defined as + + .. math:: + + d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, + + y_i &= \\theta d_i + x_i' \\beta + u_i, + + with :math:`Z \\sim \\text{Bernoulli}(0.5)` and + + .. math:: + + \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). + + The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries + :math:`\\beta_j=\\frac{1}{j^2}`. + + The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Guber and + Klaaßen (2020). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + alpha_x : + The value of the parameter :math:`\\alpha_x`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion + Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. + """ + # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 + xx = np.random.multivariate_normal( + np.zeros(2), + np.array([[1.0, 0.3], [0.3, 1.0]]), + size=[ + n_obs, + ], + ) + u = xx[:, 0] + v = xx[:, 1] + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + beta = [1 / (k**2) for k in range(1, dim_x + 1)] + + z = np.random.binomial( + p=0.5, + n=1, + size=[ + n_obs, + ], + ) + d = 1.0 * (alpha_x * z + v > 0) + y = d * theta + np.dot(x, beta) + u + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols, "z") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/irm/datasets/dgp_irm_data.py b/doubleml/irm/datasets/dgp_irm_data.py new file mode 100644 index 000000000..973902ec6 --- /dev/null +++ b/doubleml/irm/datasets/dgp_irm_data.py @@ -0,0 +1,103 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type="DoubleMLData"): + """ + Generates data from an interactive regression (IRM) model. + The data generating process is defined as + + ..
math:: + + d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i + \\sim \\mathcal{U}(0,1), + + y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and + :math:`c_d` are given by + + .. math:: + + c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = + \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. + + The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni + et al. (2017). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + R2_d : + The value of the parameter :math:`R_d^2`. + R2_y : + The value of the parameter :math:`R_y^2`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + + References + ---------- + Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With + High‐Dimensional Data. Econometrica, 85: 233-298. + """ + # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see supplement + v = np.random.uniform( + size=[ + n_obs, + ] + ) + zeta = np.random.standard_normal( + size=[ + n_obs, + ] + ) + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + beta = [1 / (k**2) for k in range(1, dim_x + 1)] + b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) + c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) + c_d = np.sqrt(np.pi**2 / 3.0 * R2_d / ((1 - R2_d) * b_sigma_b)) + + xx = np.exp(np.dot(x, np.multiply(beta, c_d))) + d = 1.0 * ((xx / (1 + xx)) > v) + + y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py b/doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py new file mode 100644 index 000000000..af621c9da --- /dev/null +++ b/doubleml/irm/datasets/dgp_irm_data_discrete_treatments.py @@ -0,0 +1,164 @@ +import numpy as np +from scipy.linalg import toeplitz + + +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): + """ + Generates data from an interactive regression (IRM) model with multiple treatment levels (based on an + underlying continuous treatment). + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)).
+ + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + A continuous treatment :math:`D_{\\text{cont}}` is generated as + + .. math:: + + D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, + + where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment + effect is defined as + + .. math:: + + \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. + + Based on the continuous treatment, a discrete treatment :math:`D` is generated with a baseline level of + :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels + is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. + + The potential outcomes are defined as + + .. math:: + + Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y + + Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), + + where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as + + .. math:: + + Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. + + The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``200``. + + n_levels : int + The number of treatment levels. + Default is ``3``. + + linear : bool + Indicates whether the true underlying regression is linear. + Default is ``False``. + + random_state : int + Random seed for reproducibility. + Default is ``None``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + The oracle values contain the continuous treatment, the level bounds, the potential level, ITE + and the potential outcome without treatment.
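A minimal usage sketch of this multi-level DGP (import path per this PR; the oracle keys are those returned by the function):

```python
import numpy as np
from doubleml.irm.datasets import make_irm_data_discrete_treatments  # path per this PR

data = make_irm_data_discrete_treatments(n_obs=200, n_levels=3, random_state=42)
d = data["d"]
oracle = data["oracle_values"]
# treatment takes the baseline level 0 plus the levels 1, ..., n_levels
print(np.unique(d), oracle["level_bounds"])
```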
+ + """ + if random_state is not None: + np.random.seed(random_state) + xi = kwargs.get("xi", 0.3) + c = kwargs.get("c", 0.0) + dim_x = kwargs.get("dim_x", 5) + + if not isinstance(n_levels, int): + raise ValueError("n_levels must be an integer.") + if n_levels < 2: + raise ValueError("n_levels must be at least 2.") + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + def f_reg(w): + res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) + return res + + def f_treatment(w, xi): + res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + def treatment_effect(d, scale=15): + return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + if linear: + g = f_reg(x) + m = f_treatment(x, xi) + else: + assert not linear + g = f_reg(z) + m = f_treatment(z, xi) + + cont_d = m + eps_d + level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) + potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 + eta = np.random.uniform(0, 1, size=n_obs) + d = 1.0 * (eta >= 1 / n_levels) * potential_level + + ite = treatment_effect(cont_d) + y0 = g + eps_y + # only treated for d > 0 compared to the baseline + y = ite * (d > 0) + y0 + + oracle_values = { + "cont_d": cont_d, + "level_bounds": level_bounds, + "potential_level": potential_level, + "ite": ite, + "y0": y0, + } + + resul_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} + + return resul_dict diff --git a/doubleml/irm/datasets/dgp_ssm_data.py b/doubleml/irm/datasets/dgp_ssm_data.py new file mode 100644 index 000000000..6a6a5bee8 --- /dev/null +++ b/doubleml/irm/datasets/dgp_ssm_data.py @@ -0,0 +1,102 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type="DoubleMLData"): + """ + Generates data from a sample selection model (SSM). + The data generating process is defined as + + .. math:: + + y_i &= \\theta d_i + x_i' \\beta d_i + u_i, + + s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, + + d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, + + with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where + :math:`\\Sigma^2_x` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` + :math:`z_i \\sim \\mathcal{N}(0, 1)`, + :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, + :math:`w_i \\sim \\mathcal{N}(0, 1)`. 
+ + + The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, + Huber and Lafférs (2023). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + mar: + Boolean. Indicates whether missingness at random holds. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. + + References + ---------- + Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, + Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 + """ + if mar: + sigma = np.array([[1, 0], [0, 1]]) + gamma = 0 + else: + sigma = np.array([[1, 0.8], [0.8, 1]]) + gamma = 1 + + e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + beta = [0.4 / (k**2) for k in range(1, dim_x + 1)] + + d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) + z = np.random.randn(n_obs) + s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) + + y = np.dot(x, beta) + theta * d + e[1] + y[s == 0] = 0 + + if return_type in _array_alias: + return x, y, d, z, s + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + if mar: + data = pd.DataFrame(np.column_stack((x, y, d, s)), columns=x_cols + ["y", "d", "s"]) + else: + data = pd.DataFrame(np.column_stack((x, y, d, z, s)), columns=x_cols + ["y", "d", "z", "s"]) + if return_type in _data_frame_alias: + return data + else: + if mar: + return DoubleMLData(data, "y", "d", x_cols, None, None, "s") + return DoubleMLData(data, "y", "d", x_cols, "z", None, "s") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index a43c0a035..70c09cdee 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -80,7 +80,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_iivm_data + >>> from doubleml.irm.datasets import make_iivm_data >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 9bf5ed35f..10f6377c7 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -84,7 +84,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index c98e8fa2d..f46fb38c0 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -83,7 +83,7 @@ class DoubleMLLPQ(NonLinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import 
doubleml as dml - >>> from doubleml.datasets import make_iivm_data + >>> from doubleml.irm.datasets import make_iivm_data >>> from sklearn.ensemble import RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py index f64dc4719..d0425845b 100644 --- a/doubleml/irm/pq.py +++ b/doubleml/irm/pq.py @@ -90,7 +90,7 @@ class DoubleMLPQ(NonLinearScoreMixin, DoubleML): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 68b91a9a7..a2c803a34 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -72,7 +72,7 @@ class DoubleMLQTE: -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_irm_data + >>> from doubleml.irm.datasets import make_irm_data >>> from sklearn.ensemble import RandomForestClassifier >>> np.random.seed(3141) >>> ml_g = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=10, min_samples_leaf=2) @@ -499,7 +499,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018 >>> from sklearn.ensemble import RandomForestRegressor >>> from sklearn.base import clone >>> np.random.seed(3141) diff --git a/doubleml/irm/tests/conftest.py b/doubleml/irm/tests/conftest.py index 1cf1d5250..0a3d4db8a 100644 --- a/doubleml/irm/tests/conftest.py +++ b/doubleml/irm/tests/conftest.py @@ -4,7 +4,7 @@ from scipy.linalg import toeplitz from sklearn.datasets import make_spd_matrix -from doubleml.datasets import make_iivm_data, make_irm_data +from doubleml.irm.datasets import make_iivm_data, make_irm_data def _g(x): diff --git a/doubleml/irm/tests/test_apo.py b/doubleml/irm/tests/test_apo.py index df4ec284f..7558b7c17 100644 --- a/doubleml/irm/tests/test_apo.py +++ b/doubleml/irm/tests/test_apo.py @@ -8,7 +8,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data, make_irm_data_discrete_treatments from ...tests._utils import draw_smpls from ._utils_apo_manual import boot_apo, fit_apo, fit_sensitivity_elements_apo diff --git a/doubleml/irm/tests/test_apo_exceptions.py b/doubleml/irm/tests/test_apo_exceptions.py index cfb6e93b5..e643efcaf 100644 --- a/doubleml/irm/tests/test_apo_exceptions.py +++ b/doubleml/irm/tests/test_apo_exceptions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_iivm_data, make_irm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_irm_data_discrete_treatments n = 100 data_apo = make_irm_data_discrete_treatments(n_obs=n) diff --git a/doubleml/irm/tests/test_apo_external_predictions.py b/doubleml/irm/tests/test_apo_external_predictions.py index 2bbe50e81..246ef0217 100644 --- 
a/doubleml/irm/tests/test_apo_external_predictions.py +++ b/doubleml/irm/tests/test_apo_external_predictions.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLAPO, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_apos.py b/doubleml/irm/tests/test_apos.py index 746cb63c9..55a48ced8 100644 --- a/doubleml/irm/tests/test_apos.py +++ b/doubleml/irm/tests/test_apos.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data, make_irm_data_discrete_treatments from ...tests._utils import confint_manual from ._utils_apos_manual import boot_apos, fit_apos diff --git a/doubleml/irm/tests/test_apos_classfier.py b/doubleml/irm/tests/test_apos_classfier.py index 06fdc3085..f9cfc10c8 100644 --- a/doubleml/irm/tests/test_apos_classfier.py +++ b/doubleml/irm/tests/test_apos_classfier.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments from ...tests._utils import confint_manual from ._utils_apos_manual import boot_apos, fit_apos diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py index c309b7e24..f1c9b3d6e 100644 --- a/doubleml/irm/tests/test_apos_exceptions.py +++ b/doubleml/irm/tests/test_apos_exceptions.py @@ -4,7 +4,7 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLAPOS, DoubleMLData -from doubleml.datasets import make_iivm_data, make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_iivm_data, make_irm_data_discrete_treatments n = 100 data = make_irm_data_discrete_treatments(n_obs=n) diff --git a/doubleml/irm/tests/test_apos_external_predictions.py b/doubleml/irm/tests/test_apos_external_predictions.py index 9e97de072..ed4323adf 100644 --- a/doubleml/irm/tests/test_apos_external_predictions.py +++ b/doubleml/irm/tests/test_apos_external_predictions.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLAPOS, DoubleMLData -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_apos_weighted_scores.py b/doubleml/irm/tests/test_apos_weighted_scores.py index ea612decf..6d0a7f653 100644 --- a/doubleml/irm/tests/test_apos_weighted_scores.py +++ b/doubleml/irm/tests/test_apos_weighted_scores.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data_discrete_treatments +from doubleml.irm.datasets import make_irm_data_discrete_treatments @pytest.fixture( diff --git a/doubleml/irm/tests/test_iivm_external_predictions.py b/doubleml/irm/tests/test_iivm_external_predictions.py index 7f4626e95..d71d2bb51 100644 --- a/doubleml/irm/tests/test_iivm_external_predictions.py +++ 
b/doubleml/irm/tests/test_iivm_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLData, DoubleMLIIVM -from doubleml.datasets import make_iivm_data +from doubleml.irm.datasets import make_iivm_data from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor diff --git a/doubleml/irm/tests/test_irm.py b/doubleml/irm/tests/test_irm.py index f99f2253b..856c7f59d 100644 --- a/doubleml/irm/tests/test_irm.py +++ b/doubleml/irm/tests/test_irm.py @@ -8,7 +8,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils.resampling import DoubleMLResampling from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_irm_external_predictions.py b/doubleml/irm/tests/test_irm_external_predictions.py index dabf6c0eb..5d0412d5f 100644 --- a/doubleml/irm/tests/test_irm_external_predictions.py +++ b/doubleml/irm/tests/test_irm_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLData, DoubleMLIRM -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor diff --git a/doubleml/irm/tests/test_lpq_external_predictions.py b/doubleml/irm/tests/test_lpq_external_predictions.py index 66f2ece6e..48cb42f57 100644 --- a/doubleml/irm/tests/test_lpq_external_predictions.py +++ b/doubleml/irm/tests/test_lpq_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LogisticRegression from doubleml import DoubleMLData, DoubleMLLPQ -from doubleml.datasets import make_iivm_data +from doubleml.irm.datasets import make_iivm_data from doubleml.utils import DMLDummyClassifier from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_pq_external_predictions.py b/doubleml/irm/tests/test_pq_external_predictions.py index 28f8ec660..9674c4648 100644 --- a/doubleml/irm/tests/test_pq_external_predictions.py +++ b/doubleml/irm/tests/test_pq_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LogisticRegression from doubleml import DoubleMLData, DoubleMLPQ -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from doubleml.utils import DMLDummyClassifier from ...tests._utils import draw_smpls diff --git a/doubleml/irm/tests/test_qte.py b/doubleml/irm/tests/test_qte.py index 0557c85b8..7fcbeec23 100644 --- a/doubleml/irm/tests/test_qte.py +++ b/doubleml/irm/tests/test_qte.py @@ -8,7 +8,7 @@ from sklearn.linear_model import LogisticRegression import doubleml as dml -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data from ...tests._utils import confint_manual, draw_smpls from ...utils._estimation import _default_kde diff --git a/doubleml/irm/tests/test_qte_exceptions.py b/doubleml/irm/tests/test_qte_exceptions.py index 9f94f5d4a..f4e951107 100644 --- a/doubleml/irm/tests/test_qte_exceptions.py +++ b/doubleml/irm/tests/test_qte_exceptions.py @@ -6,7 +6,7 @@ from doubleml import DoubleMLData, DoubleMLQTE from doubleml.data.base_data import DoubleMLBaseData -from doubleml.datasets import make_irm_data +from doubleml.irm.datasets import make_irm_data np.random.seed(42) n = 100 diff --git a/doubleml/irm/tests/test_ssm_exceptions.py b/doubleml/irm/tests/test_ssm_exceptions.py 
index 6ff276e32..50b082ec5 100644 --- a/doubleml/irm/tests/test_ssm_exceptions.py +++ b/doubleml/irm/tests/test_ssm_exceptions.py @@ -6,7 +6,7 @@ from doubleml import DoubleMLSSM from doubleml.data.base_data import DoubleMLBaseData -from doubleml.datasets import make_ssm_data +from doubleml.irm.datasets import make_ssm_data np.random.seed(3141) n = 100 diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py new file mode 100644 index 000000000..f8928902e --- /dev/null +++ b/doubleml/plm/datasets/__init__.py @@ -0,0 +1,20 @@ +""" +The :mod:`doubleml.plm.datasets` module implements data generating processes for partially linear models. +""" + +from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 +from .dgp_plr_turrell2018 import make_plr_turrell2018 +from .dgp_confounded_plr_data import make_confounded_plr_data +from .dgp_pliv_CHS2015 import make_pliv_CHS2015 +from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 +from ._make_pliv_data import _make_pliv_data + + +__all__ = [ + "make_plr_CCDDHNR2018", + "make_plr_turrell2018", + "make_confounded_plr_data", + "make_pliv_CHS2015", + "make_pliv_multiway_cluster_CKMS2021", + "_make_pliv_data", +] diff --git a/doubleml/plm/datasets/_make_pliv_data.py b/doubleml/plm/datasets/_make_pliv_data.py new file mode 100644 index 000000000..deb7cc539 --- /dev/null +++ b/doubleml/plm/datasets/_make_pliv_data.py @@ -0,0 +1,70 @@ +""" +Helper function for partially linear IV data generation. +""" + +import numpy as np +import pandas as pd +from sklearn.datasets import make_spd_matrix + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def _g(x): + return np.power(np.sin(x), 2) + + +def _m(x, nu=0.0, gamma=1.0): + return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) + + +def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type="DoubleMLData"): + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal( + np.zeros(dim_x), + sigma, + size=[ + n_obs, + ], + ) + G = _g(np.dot(x, b)) + # instrument + z = _m(np.dot(x, b)) + np.random.standard_normal( + size=[ + n_obs, + ] + ) + # treatment + M = _m(gamma_z * z + np.dot(x, b)) + d = M + np.random.standard_normal( + size=[ + n_obs, + ] + ) + y = ( + np.dot(theta, d) + + G + + np.random.standard_normal( + size=[ + n_obs, + ] + ) + ) + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d", "z"]) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols, "z") + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_confounded_plr_data.py b/doubleml/plm/datasets/dgp_confounded_plr_data.py new file mode 100644 index 000000000..794e3db12 --- /dev/null +++ b/doubleml/plm/datasets/dgp_confounded_plr_data.py @@ -0,0 +1,171 @@ +import numpy as np +from scipy.linalg import toeplitz +from scipy.optimize import minimize_scalar + + +def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): + """ + Generates confounded data from a partially linear regression model.
+ + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, + where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + First, define the treatment as + + .. math:: + + D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D + + with :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)`. + Since :math:`A` is independent of :math:`X`, the long and short forms of the treatment regression are given as + + .. math:: + + E[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + + E[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 210 + 27.4 \\cdot Z_1 + 13.7 \\cdot (Z_2 + Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. + This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. + Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample). + + The observed data is given as :math:`W = (Y, D, X)`. + Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`, the effect :math:`\\theta`, + the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and + the propensity score are returned in a dictionary. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``5.0``. + cf_y : float + Percentage of the residual variation of the outcome explained by the latent/confounding variable. + Default is ``0.04``. + cf_d : float + Percentage gains in the variation of the Riesz Representer generated by the latent/confounding variable. + Default is ``0.04``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant'Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
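A minimal usage sketch of the confounded PLR DGP (import path per this PR; the oracle keys are those returned by the function):

```python
from doubleml.plm.datasets import make_confounded_plr_data  # path per this PR

res = make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04)
oracle = res["oracle_values"]
# gamma_a and beta_a are calibrated in sample to match the requested cf_d and cf_y
print(oracle["gamma_a"], oracle["beta_a"])
```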
+ """ + c = kwargs.get("c", 0.0) + dim_x = kwargs.get("dim_x", 4) + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + cov_mat, + size=[ + n_obs, + ], + ) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # get the required impact of the confounder on the propensity score + m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] + + def f_m(gamma_a): + rr_long = eps_d / var_eps_d + rr_short = (gamma_a * a + eps_d) / (gamma_a**2 * var_a + var_eps_d) + C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) + return np.square(C2_D / (1 + C2_D) - cf_d) + + gamma_a = minimize_scalar(f_m).x + m_long = m_short + gamma_a * a + d = m_long + eps_d + + # short and long version of g + g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3]) + + var_d = np.var(d) + + def f_g(beta_a): + g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) + y_diff = eps_y + g_diff + return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) + + beta_a = minimize_scalar(f_g).x + + g_long = theta * d + g_partial_reg + beta_a * a + g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg + + y = g_long + eps_y + + oracle_values = { + "g_long": g_long, + "g_short": g_short, + "m_long": m_long, + "m_short": m_short, + "theta": theta, + "gamma_a": gamma_a, + "beta_a": beta_a, + "a": a, + "z": z, + } + + res_dict = {"x": x, "y": y, "d": d, "oracle_values": oracle_values} + + return res_dict diff --git a/doubleml/plm/datasets/dgp_pliv_CHS2015.py b/doubleml/plm/datasets/dgp_pliv_CHS2015.py new file mode 100644 index 000000000..7542803a2 --- /dev/null +++ b/doubleml/plm/datasets/dgp_pliv_CHS2015.py @@ -0,0 +1,108 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_data_alias + + +def make_pliv_CHS2015(n_obs, alpha=1.0, dim_x=200, dim_z=150, return_type="DoubleMLData"): + """ + Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). + The data generating process is defined as + + .. math:: + + z_i &= \\Pi x_i + \\zeta_i, + + d_i &= x_i' \\gamma + z_i' \\delta + u_i, + + y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, + + with + + .. 
math:: + + \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ + 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) + + where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. + :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, + :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` + and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + alpha : + The value of the causal parameter. + dim_x : + The number of covariates. + dim_z : + The number of instruments. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear + Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. + """ + assert dim_x >= dim_z + # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf + xx = np.random.multivariate_normal( + np.zeros(2), + np.array([[1.0, 0.6], [0.6, 1.0]]), + size=[ + n_obs, + ], + ) + epsilon = xx[:, 0] + u = xx[:, 1] + + sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) + x = np.random.multivariate_normal( + np.zeros(dim_x), + sigma, + size=[ + n_obs, + ], + ) + + I_z = np.eye(dim_z) + xi = np.random.multivariate_normal( + np.zeros(dim_z), + 0.25 * I_z, + size=[ + n_obs, + ], + ) + + beta = [1 / (k**2) for k in range(1, dim_x + 1)] + gamma = beta + delta = [1 / (k**2) for k in range(1, dim_z + 1)] + Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) + + z = np.dot(x, np.transpose(Pi)) + xi + d = np.dot(x, gamma) + np.dot(z, delta) + u + y = alpha * d + np.dot(x, beta) + epsilon + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + z_cols = [f"Z{i + 1}" for i in np.arange(dim_z)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["y", "d"] + z_cols) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, "y", "d", x_cols, z_cols) + else: + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py new file mode 100644 index 000000000..df2b4cbe5 --- /dev/null +++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py @@ -0,0 +1,199 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from doubleml.data import DoubleMLClusterData +from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_cluster_data_alias + + +def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLClusterData", **kwargs): + """ + Generates data from a partially linear IV regression model with multiway 
diff --git a/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py
new file mode 100644
index 000000000..df2b4cbe5
--- /dev/null
+++ b/doubleml/plm/datasets/dgp_pliv_multiway_cluster_CKMS2021.py
@@ -0,0 +1,199 @@
+import numpy as np
+import pandas as pd
+from scipy.linalg import toeplitz
+
+from doubleml.data import DoubleMLClusterData
+from doubleml.utils._aliases import _array_alias, _data_frame_alias, _dml_cluster_data_alias
+
+
+def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return_type="DoubleMLClusterData", **kwargs):
+    """
+    Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al.
+    (2021). The data generating process is defined as
+
+    .. math::
+
+        Z_{ij} &= X_{ij}' \\xi_0 + V_{ij},
+
+        D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij},
+
+        Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij},
+
+    with
+
+    .. math::
+
+        X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X
+        + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X,
+
+        \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon
+        + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon,
+
+        v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v
+        + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v,
+
+        V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V
+        + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V,
+
+    and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)`
+    where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries
+    :math:`\\Sigma_{kj} = s_X^{|j-k|}`.
+    Further
+
+    .. math::
+
+        \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right),
+        \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right),
+        \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right)
+        \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\
+        s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right)
+
+
+    and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`.
+
+    Parameters
+    ----------
+    N :
+        The number of observations (first dimension).
+    M :
+        The number of observations (second dimension).
+    dim_X :
+        The number of covariates.
+    theta :
+        The value of the causal parameter.
+    return_type :
+        If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where
+        ``DoubleMLClusterData.data`` is a ``pd.DataFrame``.
+
+        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
+
+        If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s
+        ``(x, y, d, cluster_vars, z)``.
+    **kwargs
+        Additional keyword arguments to set non-default values for the parameters
+        :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`,
+        :math:`s_X = s_{\\varepsilon v} = 0.25`,
+        or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries
+        :math:`(\\zeta_{0})_j = 0.5^j`.
+
+    References
+    ----------
+    Chiang, H. D., Kato, K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning,
+    Journal of Business & Economic Statistics,
+    doi: `10.1080/07350015.2021.1895815 <https://doi.org/10.1080/07350015.2021.1895815>`_,
+    arXiv:`1909.03489 <https://arxiv.org/abs/1909.03489>`_.
+    """
+    # additional parameters specifiable via kwargs
+    pi_10 = kwargs.get("pi_10", 1.0)
+
+    xx = np.arange(1, dim_X + 1)
+    zeta_0 = kwargs.get("zeta_0", np.power(0.5, xx))
+    pi_20 = kwargs.get("pi_20", np.power(0.5, xx))
+    xi_0 = kwargs.get("xi_0", np.power(0.5, xx))
+
+    omega_X = kwargs.get("omega_X", np.array([0.25, 0.25]))
+    omega_epsilon = kwargs.get("omega_epsilon", np.array([0.25, 0.25]))
+    omega_v = kwargs.get("omega_v", np.array([0.25, 0.25]))
+    omega_V = kwargs.get("omega_V", np.array([0.25, 0.25]))
+
+    s_X = kwargs.get("s_X", 0.25)
+    s_epsilon_v = kwargs.get("s_epsilon_v", 0.25)
+
+    # use np.tile() and np.repeat() for repeating vectors in different styles, i.e.,
+    # np.tile([v1, v2, v3], 2) -> [v1, v2, v3, v1, v2, v3]
+    # np.repeat([v1, v2, v3], 2) -> [v1, v1, v2, v2, v3, v3]
+
+    alpha_V = np.random.normal(size=(N * M))
+    alpha_V_i = np.repeat(np.random.normal(size=N), M)
+    alpha_V_j = np.tile(np.random.normal(size=M), N)
+
+    cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]])
+    alpha_eps_v = np.random.multivariate_normal(
+        np.zeros(2),
+        cov_mat,
+        size=[
+            N * M,
+        ],
+    )
+    alpha_eps = alpha_eps_v[:, 0]
+    alpha_v = alpha_eps_v[:, 1]
+
+    alpha_eps_v_i = np.random.multivariate_normal(
+        np.zeros(2),
+        cov_mat,
+        size=[
+            N,
+        ],
+    )
+    alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M)
+    alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M)
+
+    alpha_eps_v_j = np.random.multivariate_normal(
+        np.zeros(2),
+        cov_mat,
+        size=[
+            M,
+        ],
+    )
+    alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N)
+    alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N)
+
+    cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)])
+    alpha_X = np.random.multivariate_normal(
+        np.zeros(dim_X),
+        cov_mat,
+        size=[
+            N * M,
+        ],
+    )
+    alpha_X_i = np.repeat(
+        np.random.multivariate_normal(
+            np.zeros(dim_X),
+            cov_mat,
+            size=[
+                N,
+            ],
+        ),
+        M,
+        axis=0,
+    )
+    alpha_X_j = np.tile(
+        np.random.multivariate_normal(
+            np.zeros(dim_X),
+            cov_mat,
+            size=[
+                M,
+            ],
+        ),
+        (N, 1),
+    )
+
+    # generate variables
+    x = (1 - omega_X[0] - omega_X[1]) * alpha_X + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j
+
+    eps = (
+        (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j
+    )
+
+    v = (1 - omega_v[0] - omega_v[1]) * alpha_v + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j
+
+    V = (1 - omega_V[0] - omega_V[1]) * alpha_V + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j
+
+    z = np.matmul(x, xi_0) + V
+    d = z * pi_10 + np.matmul(x, pi_20) + v
+    y = d * theta + np.matmul(x, zeta_0) + eps
+
+    cluster_cols = ["cluster_var_i", "cluster_var_j"]
+    cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True)
+
+    if return_type in _array_alias:
+        return x, y, d, cluster_vars.values, z
+    elif return_type in _data_frame_alias + _dml_cluster_data_alias:
+        x_cols = [f"X{i + 1}" for i in np.arange(dim_X)]
+        data = pd.concat((cluster_vars, pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ["Y", "D", "Z"])), axis=1)
+        if return_type in _data_frame_alias:
+            return data
+        else:
+            return DoubleMLClusterData(data, "Y", "D", cluster_cols, x_cols, "Z")
+    else:
+        raise ValueError("Invalid return_type.")
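A minimal usage sketch of `make_pliv_multiway_cluster_CKMS2021`; `pi_10` is one of the DGP parameters that the docstring above says can be overridden via `**kwargs`:

```python
import numpy as np
from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021

np.random.seed(3141)
# N x M = 625 observations with cluster variables cluster_var_i / cluster_var_j
obj_dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100)
# non-default DGP parameters are passed through **kwargs, e.g. pi_10
df = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, dim_X=20, pi_10=0.5, return_type="DataFrame")
```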
diff --git a/doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py b/doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py
new file mode 100644
index 000000000..7d6fdf9e8
--- /dev/null
+++ b/doubleml/plm/datasets/dgp_plr_CCDDHNR2018.py
@@ -0,0 +1,108 @@
+import numpy as np
+import pandas as pd
+from scipy.linalg import toeplitz
+
+from doubleml.data import DoubleMLData
+from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias
+
+_array_alias = _get_array_alias()
+_data_frame_alias = _get_data_frame_alias()
+_dml_data_alias = _get_dml_data_alias()
+
+
+def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", **kwargs):
+    """
+    Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1.
+    The data generating process is defined as
+
+    .. math::
+
+        d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1),
+
+        y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1),
+
+
+    with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+    :math:`\\Sigma_{kj} = 0.7^{|j-k|}`.
+    The nuisance functions are given by
+
+    .. math::
+
+        m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})},
+
+        g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}.
+
+    Parameters
+    ----------
+    n_obs :
+        The number of observations to simulate.
+    dim_x :
+        The number of covariates.
+    alpha :
+        The value of the causal parameter.
+    return_type :
+        If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
+
+        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
+
+        If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``.
+    **kwargs
+        Additional keyword arguments to set non-default values for the parameters
+        :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`.
+
+    References
+    ----------
+    Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018),
+    Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68.
+    doi:`10.1111/ectj.12097 <https://doi.org/10.1111/ectj.12097>`_.
+    """
+    a_0 = kwargs.get("a_0", 1.0)
+    a_1 = kwargs.get("a_1", 0.25)
+    s_1 = kwargs.get("s_1", 1.0)
+
+    b_0 = kwargs.get("b_0", 1.0)
+    b_1 = kwargs.get("b_1", 0.25)
+    s_2 = kwargs.get("s_2", 1.0)
+
+    cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)])
+    x = np.random.multivariate_normal(
+        np.zeros(dim_x),
+        cov_mat,
+        size=[
+            n_obs,
+        ],
+    )
+
+    d = (
+        a_0 * x[:, 0]
+        + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2]))
+        + s_1
+        * np.random.standard_normal(
+            size=[
+                n_obs,
+            ]
+        )
+    )
+    y = (
+        alpha * d
+        + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0]))
+        + b_1 * x[:, 2]
+        + s_2
+        * np.random.standard_normal(
+            size=[
+                n_obs,
+            ]
+        )
+    )
+
+    if return_type in _array_alias:
+        return x, y, d
+    elif return_type in _data_frame_alias + _dml_data_alias:
+        x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
+        data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"])
+        if return_type in _data_frame_alias:
+            return data
+        else:
+            return DoubleMLData(data, "y", "d", x_cols)
+    else:
+        raise ValueError("Invalid return_type.")
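A minimal usage sketch of `make_plr_CCDDHNR2018` covering the three documented return types:

```python
import numpy as np
from doubleml.plm.datasets import make_plr_CCDDHNR2018

np.random.seed(3141)
dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5)
df = make_plr_CCDDHNR2018(n_obs=500, return_type="DataFrame")
(x, y, d) = make_plr_CCDDHNR2018(n_obs=500, return_type="array")
```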
diff --git a/doubleml/plm/datasets/dgp_plr_turrell2018.py b/doubleml/plm/datasets/dgp_plr_turrell2018.py
new file mode 100644
index 000000000..5cfefdd8f
--- /dev/null
+++ b/doubleml/plm/datasets/dgp_plr_turrell2018.py
@@ -0,0 +1,107 @@
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_spd_matrix
+
+from doubleml.data import DoubleMLData
+from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias
+
+_array_alias = _get_array_alias()
+_data_frame_alias = _get_data_frame_alias()
+_dml_data_alias = _get_dml_data_alias()
+
+
+def _g(x):
+    return np.power(np.sin(x), 2)
+
+
+def _m(x, nu=0.0, gamma=1.0):
+    return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu))
+
+
+def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type="DoubleMLData", **kwargs):
+    """
+    Generates data from a partially linear regression model used in a blog article by Turrell (2018).
+    The data generating process is defined as
+
+    .. math::
+
+        d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1),
+
+        y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1),
+
+
+    with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric,
+    positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`.
+    :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by
+
+    .. math::
+
+        m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)},
+
+        g_0(x_i) &= \\sin(x_i)^2.
+
+    Parameters
+    ----------
+    n_obs :
+        The number of observations to simulate.
+    dim_x :
+        The number of covariates.
+    theta :
+        The value of the causal parameter.
+    return_type :
+        If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
+
+        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
+
+        If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``.
+    **kwargs
+        Additional keyword arguments to set non-default values for the parameters
+        :math:`\\nu=0`, or :math:`\\gamma=1`.
+
+    References
+    ----------
+    Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics,
+    science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/
+    <https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/>`_.
+    """
+    nu = kwargs.get("nu", 0.0)
+    gamma = kwargs.get("gamma", 1.0)
+
+    b = [1 / k for k in range(1, dim_x + 1)]
+    sigma = make_spd_matrix(dim_x)
+
+    x = np.random.multivariate_normal(
+        np.zeros(dim_x),
+        sigma,
+        size=[
+            n_obs,
+        ],
+    )
+    G = _g(np.dot(x, b))
+    M = _m(np.dot(x, b), nu=nu, gamma=gamma)
+    d = M + np.random.standard_normal(
+        size=[
+            n_obs,
+        ]
+    )
+    y = (
+        np.dot(theta, d)
+        + G
+        + np.random.standard_normal(
+            size=[
+                n_obs,
+            ]
+        )
+    )
+
+    if return_type in _array_alias:
+        return x, y, d
+    elif return_type in _data_frame_alias + _dml_data_alias:
+        x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
+        data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + ["y", "d"])
+        if return_type in _data_frame_alias:
+            return data
+        else:
+            return DoubleMLData(data, "y", "d", x_cols)
+    else:
+        raise ValueError("Invalid return_type.")
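A minimal usage sketch of `make_plr_turrell2018`; `nu` and `gamma` are the `**kwargs` documented above, shown here with illustrative non-default values:

```python
import numpy as np
from doubleml.plm.datasets import make_plr_turrell2018

np.random.seed(3141)
dml_data = make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5)
# nu and gamma of the nuisance m_0 can be overridden via **kwargs
dml_data_alt = make_plr_turrell2018(n_obs=100, nu=0.1, gamma=0.5)
```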
diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py
index ba0226889..52cb796d3 100644
--- a/doubleml/plm/pliv.py
+++ b/doubleml/plm/pliv.py
@@ -62,7 +62,7 @@ class DoubleMLPLIV(LinearScoreMixin, DoubleML):
     --------
     >>> import numpy as np
     >>> import doubleml as dml
-    >>> from doubleml.datasets import make_pliv_CHS2015
+    >>> from doubleml.plm.datasets import make_pliv_CHS2015
     >>> from sklearn.ensemble import RandomForestRegressor
     >>> from sklearn.base import clone
     >>> np.random.seed(3141)
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
index a81bac48c..4a57dfcb0 100644
--- a/doubleml/plm/plr.py
+++ b/doubleml/plm/plr.py
@@ -60,7 +60,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML):
     --------
     >>> import numpy as np
     >>> import doubleml as dml
-    >>> from doubleml.datasets import make_plr_CCDDHNR2018
+    >>> from doubleml.plm.datasets import make_plr_CCDDHNR2018
     >>> from sklearn.ensemble import RandomForestRegressor
     >>> from sklearn.base import clone
     >>> np.random.seed(3141)
diff --git a/doubleml/plm/tests/conftest.py b/doubleml/plm/tests/conftest.py
index 497d6fc9d..cfde0f414 100644
--- a/doubleml/plm/tests/conftest.py
+++ b/doubleml/plm/tests/conftest.py
@@ -4,7 +4,7 @@
 from scipy.linalg import toeplitz
 from sklearn.datasets import make_spd_matrix
 
-from doubleml.datasets import make_pliv_CHS2015, make_plr_turrell2018
+from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_turrell2018
 
 
 def _g(x):
diff --git a/doubleml/plm/tests/test_pliv_external_predictions.py b/doubleml/plm/tests/test_pliv_external_predictions.py
index bc8a1e8a4..55c362abe 100644
--- a/doubleml/plm/tests/test_pliv_external_predictions.py
+++ b/doubleml/plm/tests/test_pliv_external_predictions.py
@@ -5,7 +5,7 @@
 from sklearn.linear_model import LinearRegression
 
 from doubleml import DoubleMLData, DoubleMLPLIV
-from doubleml.datasets import make_pliv_CHS2015
+from doubleml.plm.datasets import make_pliv_CHS2015
 from doubleml.utils import DMLDummyRegressor
diff --git a/doubleml/plm/tests/test_plr.py b/doubleml/plm/tests/test_plr.py
index 79f21f849..65f5ad834 100644
--- a/doubleml/plm/tests/test_plr.py
+++ b/doubleml/plm/tests/test_plr.py
@@ -304,7 +304,7 @@ def test_dml_plr_cate_gate(score, cov_type):
 
     # collect data
     np.random.seed(42)
-    obj_dml_data = dml.datasets.make_plr_CCDDHNR2018(n_obs=n)
+    obj_dml_data = dml.plm.datasets.make_plr_CCDDHNR2018(n_obs=n)
     ml_l = LinearRegression()
     ml_g = LinearRegression()
     ml_m = LinearRegression()
diff --git a/doubleml/plm/tests/test_plr_external_predictions.py b/doubleml/plm/tests/test_plr_external_predictions.py
index 47644555d..160052b14 100644
--- a/doubleml/plm/tests/test_plr_external_predictions.py
+++ b/doubleml/plm/tests/test_plr_external_predictions.py
@@ -5,7 +5,7 @@
 from sklearn.linear_model import LinearRegression
 
 from doubleml import DoubleMLData, DoubleMLPLR
-from doubleml.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
 from doubleml.utils import DMLDummyRegressor
diff --git a/doubleml/tests/conftest.py b/doubleml/tests/conftest.py
index bf53d788f..6abea18c1 100644
--- a/doubleml/tests/conftest.py
+++ b/doubleml/tests/conftest.py
@@ -4,7 +4,7 @@
 from sklearn.datasets import make_classification, make_regression, make_spd_matrix
 
 from doubleml import DoubleMLData
-from doubleml.datasets import make_pliv_CHS2015, make_plr_turrell2018
+from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_turrell2018
 
 
 def _g(x):
diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py
index 67f612e80..8f1c4f033 100644
--- a/doubleml/tests/test_datasets.py
+++ b/doubleml/tests/test_datasets.py
@@ -3,21 +3,22 @@
 import pytest
 
 from doubleml import DoubleMLClusterData, DoubleMLData
-from doubleml.datasets import (
-    _make_pliv_data,
-    fetch_401K,
-    fetch_bonus,
+from doubleml.datasets import fetch_401K, fetch_bonus
+from doubleml.irm.datasets import (
     make_confounded_irm_data,
-    make_confounded_plr_data,
     make_heterogeneous_data,
     make_iivm_data,
     make_irm_data,
     make_irm_data_discrete_treatments,
+    make_ssm_data,
+)
+from doubleml.plm.datasets import (
+    _make_pliv_data,
+    make_confounded_plr_data,
     make_pliv_CHS2015,
     make_pliv_multiway_cluster_CKMS2021,
     make_plr_CCDDHNR2018,
     make_plr_turrell2018,
-    make_ssm_data,
 )
 
 msg_inv_return_type = "Invalid return_type."
diff --git a/doubleml/tests/test_evaluate_learner.py b/doubleml/tests/test_evaluate_learner.py
index dbad9b620..2c5d3f9a6 100644
--- a/doubleml/tests/test_evaluate_learner.py
+++ b/doubleml/tests/test_evaluate_learner.py
@@ -5,7 +5,7 @@
 from sklearn.linear_model import LinearRegression, LogisticRegression
 
 import doubleml as dml
-from doubleml.datasets import make_irm_data
+from doubleml.irm.datasets import make_irm_data
 from doubleml.utils._estimation import _logloss
 
 np.random.seed(3141)
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index a4655bb9d..d8fe4e7cb 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -21,13 +21,8 @@
     DoubleMLPQ,
     DoubleMLQTE,
 )
-from doubleml.datasets import (
-    make_iivm_data,
-    make_irm_data,
-    make_pliv_CHS2015,
-    make_pliv_multiway_cluster_CKMS2021,
-    make_plr_CCDDHNR2018,
-)
+from doubleml.irm.datasets import make_iivm_data, make_irm_data
+from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
 from doubleml.did.datasets import make_did_SZ2020
 
 from ._utils import DummyDataClass
diff --git a/doubleml/tests/test_exceptions_ext_preds.py b/doubleml/tests/test_exceptions_ext_preds.py
index 3f6002825..a65b6ebb7 100644
--- a/doubleml/tests/test_exceptions_ext_preds.py
+++ b/doubleml/tests/test_exceptions_ext_preds.py
@@ -2,7 +2,7 @@
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
 from doubleml import DoubleMLCVAR, DoubleMLData, DoubleMLIRM, DoubleMLQTE
-from doubleml.datasets import make_irm_data
+from doubleml.irm.datasets import make_irm_data
 from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor
 
 df_irm = make_irm_data(n_obs=10, dim_x=2, theta=0.5, return_type="DataFrame")
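The remaining hunks all apply the same import migration. A sketch of the pattern, using only paths that appear in the hunks above and below (per `test_datasets.py`, the fetchers `fetch_401K` and `fetch_bonus` stay importable from `doubleml.datasets`):

```python
# before this change:
# from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, fetch_401K

# after this change, generators live next to their model class:
from doubleml.plm.datasets import make_plr_CCDDHNR2018, make_pliv_CHS2015
from doubleml.irm.datasets import make_irm_data, make_iivm_data
# the data fetchers are still imported from doubleml.datasets:
from doubleml.datasets import fetch_401K, fetch_bonus
```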
diff --git a/doubleml/tests/test_framework.py b/doubleml/tests/test_framework.py
index 24810b680..44dabb713 100644
--- a/doubleml/tests/test_framework.py
+++ b/doubleml/tests/test_framework.py
@@ -3,7 +3,7 @@
 import pytest
 from sklearn.linear_model import LinearRegression, LogisticRegression
 
-from doubleml.datasets import make_irm_data
+from doubleml.irm.datasets import make_irm_data
 from doubleml.double_ml_framework import DoubleMLFramework, concat
 from doubleml.irm.irm import DoubleMLIRM
diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py
index f55a555cc..8417468ad 100644
--- a/doubleml/tests/test_model_defaults.py
+++ b/doubleml/tests/test_model_defaults.py
@@ -4,13 +4,8 @@
 from sklearn.linear_model import Lasso, LogisticRegression
 
 import doubleml as dml
-from doubleml.datasets import (
-    make_iivm_data,
-    make_irm_data,
-    make_pliv_CHS2015,
-    make_plr_CCDDHNR2018,
-    make_ssm_data,
-)
+from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_ssm_data
+from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018
 from doubleml.did.datasets import make_did_SZ2020
 
 np.random.seed(3141)
diff --git a/doubleml/tests/test_multiway_cluster.py b/doubleml/tests/test_multiway_cluster.py
index b064024f0..10e5d4457 100644
--- a/doubleml/tests/test_multiway_cluster.py
+++ b/doubleml/tests/test_multiway_cluster.py
@@ -6,7 +6,7 @@
 from sklearn.linear_model import Lasso, LinearRegression
 
 import doubleml as dml
-from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021
 
 from ..plm.tests._utils_pliv_manual import compute_pliv_residuals, fit_pliv
 from ._utils import _clone
diff --git a/doubleml/tests/test_nonlinear_cluster.py b/doubleml/tests/test_nonlinear_cluster.py
index f84f3e2e9..71998941c 100644
--- a/doubleml/tests/test_nonlinear_cluster.py
+++ b/doubleml/tests/test_nonlinear_cluster.py
@@ -7,7 +7,8 @@
 from sklearn.linear_model import Lasso, LinearRegression
 
 import doubleml as dml
-from doubleml.datasets import DoubleMLClusterData, make_pliv_multiway_cluster_CKMS2021
+from doubleml import DoubleMLClusterData
+from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021
 
 from .test_nonlinear_score_mixin import DoubleMLPLRWithNonLinearScoreMixin
diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py
index 11ebd6248..03676b747 100644
--- a/doubleml/tests/test_return_types.py
+++ b/doubleml/tests/test_return_types.py
@@ -23,14 +23,8 @@
     DoubleMLPQ,
     DoubleMLSSM,
 )
-from doubleml.datasets import (
-    make_iivm_data,
-    make_irm_data,
-    make_pliv_CHS2015,
-    make_pliv_multiway_cluster_CKMS2021,
-    make_plr_CCDDHNR2018,
-    make_ssm_data,
-)
+from doubleml.irm.datasets import make_iivm_data, make_irm_data, make_ssm_data
+from doubleml.plm.datasets import make_pliv_CHS2015, make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
 from doubleml.did.datasets import make_did_SZ2020
 
 np.random.seed(3141)
diff --git a/doubleml/tests/test_scores.py b/doubleml/tests/test_scores.py
index c3281702d..0687546dc 100644
--- a/doubleml/tests/test_scores.py
+++ b/doubleml/tests/test_scores.py
@@ -3,7 +3,8 @@
 from sklearn.linear_model import Lasso, LogisticRegression
 
 from doubleml import DoubleMLIIVM, DoubleMLIRM, DoubleMLPLIV, DoubleMLPLR
-from doubleml.datasets import make_iivm_data, make_irm_data, make_pliv_CHS2015, make_plr_CCDDHNR2018
+from doubleml.irm.datasets import make_iivm_data, make_irm_data
+from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018
 
 np.random.seed(3141)
 dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index e4b434959..a0e47c0da 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -5,7 +5,7 @@
 from sklearn.linear_model import LinearRegression, LogisticRegression
 
 import doubleml as dml
-from doubleml.datasets import make_irm_data
+from doubleml.irm.datasets import make_irm_data
 
 from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_benchmark_manual, doubleml_sensitivity_manual
diff --git a/doubleml/tests/test_sensitivity_cluster.py b/doubleml/tests/test_sensitivity_cluster.py
index 65ec0d644..83f8c270c 100644
--- a/doubleml/tests/test_sensitivity_cluster.py
+++ b/doubleml/tests/test_sensitivity_cluster.py
@@ -5,7 +5,7 @@
 from sklearn.linear_model import LinearRegression
 
 import doubleml as dml
-from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+from doubleml.plm.datasets import make_pliv_multiway_cluster_CKMS2021
 
 from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_benchmark_manual
diff --git a/doubleml/tests/test_set_ml_nuisance_params.py b/doubleml/tests/test_set_ml_nuisance_params.py
index a189b184b..055bcbff7 100644
--- a/doubleml/tests/test_set_ml_nuisance_params.py
+++ b/doubleml/tests/test_set_ml_nuisance_params.py
@@ -3,7 +3,8 @@
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
 from doubleml import DoubleMLCVAR, DoubleMLIIVM, DoubleMLIRM, DoubleMLLPQ, DoubleMLPLIV, DoubleMLPLR, DoubleMLPQ
-from doubleml.datasets import make_iivm_data, make_irm_data, make_pliv_CHS2015, make_plr_CCDDHNR2018
+from doubleml.irm.datasets import make_iivm_data, make_irm_data
+from doubleml.plm.datasets import make_pliv_CHS2015, make_plr_CCDDHNR2018
 
 # set default and test values
 n_est_default = 100
diff --git a/doubleml/tests/test_set_sample_splitting.py b/doubleml/tests/test_set_sample_splitting.py
index 97313a00e..0995d831b 100644
--- a/doubleml/tests/test_set_sample_splitting.py
+++ b/doubleml/tests/test_set_sample_splitting.py
@@ -3,7 +3,7 @@
 from sklearn.linear_model import Lasso
 
 from doubleml import DoubleMLPLR
-from doubleml.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
 
 np.random.seed(3141)
 dml_data = make_plr_CCDDHNR2018(n_obs=10)