diff --git a/doubleml/rdd/datasets/simple_dgp.py b/doubleml/rdd/datasets/simple_dgp.py index 0964ec585..abfd4ed39 100644 --- a/doubleml/rdd/datasets/simple_dgp.py +++ b/doubleml/rdd/datasets/simple_dgp.py @@ -5,14 +5,22 @@ def make_simple_rdd_data(n_obs=5000, p=4, fuzzy=True, binary_outcome=False, **kwargs): """ Generates synthetic data for a regression discontinuity design (RDD) analysis. + The data generating process is defined as .. math:: - Y_0 &= g_0 + g_{cov} + \\epsilon_0 \\ - Y_1 &= g_1 + g_{cov} + \\epsilon_1 \\ - g_0 &= 0.1 \\cdot \\text{score}^2 \\ - g_1 &= \tau + 0.1 \\cdot \\text{score}^2 - 0.5 \\cdot \\text{score}^2 \\ - g_{cov} &= \\sum_{i=1}^{\text{dim\\_x}} \text{Polynomial}(X_i) \\ - \\epsilon_0, \\epsilon_1 &\\sim \\mathcal{N}(0, 0.2^2) + Y_0 &= g_0 + g_{cov} + \\epsilon_0, + + Y_1 &= g_1 + g_{cov} + \\epsilon_1, + + g_0 &= 0.1 \\cdot \\text{score}^2, + + g_1 &= \\tau + 0.1 \\cdot \\text{score}^2 - 0.5 \\cdot \\text{score}^2 + a \\cdot \\sum_{i=1}^{\\text{dim}_x} X_i \\cdot \\text{score}, + + g_{cov} &= \\sum_{i=1}^{\\text{dim}_x} \\text{Polynomial}(X_i), + + with random noise :math:`\\epsilon_0, \\epsilon_1 \\sim \\mathcal{N}(0, 0.2^2)` and :math:`X_i` + being drawn independently from a uniform distribution. Parameters ---------- @@ -20,13 +28,13 @@ def make_simple_rdd_data(n_obs=5000, p=4, fuzzy=True, binary_outcome=False, **kw Number of observations to generate. Default is 5000. p : int - Degree of the polynomial for covariates. Default is 4. + Degree of the polynomial for covariates. Default is 4. If zero, no covariate effect is considered. fuzzy : bool If True, generates data for a fuzzy RDD. Default is True. binary_outcome : bool - If True, generates binary outcomes. Default is False. + If True, generates binary outcomes based on a logistic transformation. Default is False. **kwargs : Additional keyword arguments. 
cutoff : float @@ -34,18 +42,15 @@ def make_simple_rdd_data(n_obs=5000, p=4, fuzzy=True, binary_outcome=False, **kw dim_x : int The number of independent covariates. Default is 3. a : float - Factor to control interaction of score and covariates to the outcome equation. Default is 0.0. + Factor to control interaction of score and covariates in the outcome equation. Default is 0.0. tau : float Parameter to control the true effect in the generated data at the given cutoff. Default is 1.0. Returns ------- - dict: A dictionary containing the generated data with keys: - 'score' (np.ndarray): The running variable. - 'X' (np.ndarray): The independent covariates. - 'Y0' (np.ndarray): The potential outcomes without treatment. - 'Y1' (np.ndarray): The potential outcomes with treatment. - 'intended_treatment' (np.ndarray): The intended treatment assignment. + res_dict : dictionary + Dictionary with entries ``score``, ``X``, ``Y``, ``D``, and ``oracle_values``. + The oracle values contain the potential outcomes. """ cutoff = kwargs.get('cutoff', 0.0) diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 1bbe68306..a08e9f144 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -30,7 +30,7 @@ class RDFlex(): defined as :math:`\\eta_0(X) = (g_0^{+}(X) + g_0^{-}(X))/2`. ml_m : classifier implementing ``fit()`` and ``predict_proba()`` or None - A machine learner implementing ``fit()`` and ``predict_proba()`` methods and support ``sample_weights``(e.g. + A machine learner implementing ``fit()`` and ``predict_proba()`` methods and support ``sample_weights`` (e.g. :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance functions :math:`m_0^{\\pm}(X) = E[D|\\text{score}=\\text{cutoff}^{\\pm}, X]`. The adjustment function is then defined as :math:`\\eta_0(X) = (m_0^{+}(X) + m_0^{-}(X))/2`. @@ -66,7 +66,7 @@ class RDFlex(): Default is ``cutoff``. fs_kernel : str - Kernel for the first stage estimation. 
``uniform``, ``triangular`` and ``epanechnikov``are supported. + Kernel for the first stage estimation. ``uniform``, ``triangular`` and ``epanechnikov`` are supported. Default is ``triangular``. **kwargs : kwargs @@ -74,9 +74,21 @@ class RDFlex(): Examples -------- - - Notes - ----- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.rdd.datasets import make_simple_rdd_data + >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier + >>> np.random.seed(123) + >>> data_dict = make_simple_rdd_data(fuzzy=True) + >>> obj_dml_data = dml.DoubleMLData.from_arrays(x=data_dict["X"], y=data_dict["Y"], d=data_dict["D"], s=data_dict["score"]) + >>> ml_g = RandomForestRegressor() + >>> ml_m = RandomForestClassifier() + >>> rdflex_obj = dml.rdd.RDFlex(obj_dml_data, ml_g, ml_m, fuzzy=True) + >>> print(rdflex_obj.fit()) + Method Coef. S.E. t-stat P>|t| 95% CI + ------------------------------------------------------------------------- + Conventional 0.935 0.220 4.244 2.196e-05 [0.503, 1.367] + Robust - - 3.635 2.785e-04 [0.418, 1.396] """ diff --git a/setup.py b/setup.py index f7c331a98..93efebb2f 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,8 @@ 'scikit-learn>=1.4.0,<1.6.0', 'statsmodels', 'plotly', + 'matplotlib', + 'rdrobust', ], python_requires=">=3.9", classifiers=[