Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 20 additions & 15 deletions doubleml/rdd/datasets/simple_dgp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,52 @@
def make_simple_rdd_data(n_obs=5000, p=4, fuzzy=True, binary_outcome=False, **kwargs):
"""
Generates synthetic data for a regression discontinuity design (RDD) analysis.
The data generating process is defined as

.. math::
Y_0 &= g_0 + g_{cov} + \\epsilon_0 \\
Y_1 &= g_1 + g_{cov} + \\epsilon_1 \\
g_0 &= 0.1 \\cdot \\text{score}^2 \\
g_1 &= \\tau + 0.1 \\cdot \\text{score}^2 - 0.5 \\cdot \\text{score}^2 \\
g_{cov} &= \\sum_{i=1}^{\\text{dim\\_x}} \\text{Polynomial}(X_i) \\
\\epsilon_0, \\epsilon_1 &\\sim \\mathcal{N}(0, 0.2^2)
Y_0 &= g_0 + g_{cov} + \\epsilon_0,

Y_1 &= g_1 + g_{cov} + \\epsilon_1,

g_0 &= 0.1 \\cdot \\text{score}^2,

g_1 &= \\tau + 0.1 \\cdot \\text{score}^2 - 0.5 \\cdot \\text{score}^2 + a
\\sum_{i=1}^{\\text{dim}_x} X_i \\cdot \\text{score},

g_{cov} &= \\sum_{i=1}^{\\text{dim}_x} \\text{Polynomial}(X_i),

with random noise :math:`\\epsilon_0, \\epsilon_1 \\sim \\mathcal{N}(0, 0.2^2)` and :math:`X_i`
being drawn independently from a uniform distribution.

Parameters
----------
n_obs : int
Number of observations to generate. Default is 5000.

p : int
Degree of the polynomial for covariates. Default is 4.
Degree of the polynomial for covariates. Default is 4. If zero, no covariate effect is considered.

fuzzy : bool
If True, generates data for a fuzzy RDD. Default is True.

binary_outcome : bool
If True, generates binary outcomes. Default is False.
If True, generates binary outcomes based on a logistic transformation. Default is False.

**kwargs : Additional keyword arguments.
cutoff : float
The cutoff value for the score. Default is 0.0.
dim_x : int
The number of independent covariates. Default is 3.
a : float
Factor to control interaction of score and covariates to the outcome equation. Default is 0.0.
Factor to control interaction of score and covariates in the outcome equation. Default is 0.0.
tau : float
Parameter to control the true effect in the generated data at the given cutoff. Default is 1.0.

Returns
-------
dict: A dictionary containing the generated data with keys:
'score' (np.ndarray): The running variable.
'X' (np.ndarray): The independent covariates.
'Y0' (np.ndarray): The potential outcomes without treatment.
'Y1' (np.ndarray): The potential outcomes with treatment.
'intended_treatment' (np.ndarray): The intended treatment assignment.
res_dict : dictionary
Dictionary with entries ``score``, ``X``, ``Y``, ``D``, and ``oracle_values``.
The oracle values contain the potential outcomes.
"""

cutoff = kwargs.get('cutoff', 0.0)
Expand Down
22 changes: 17 additions & 5 deletions doubleml/rdd/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class RDFlex():
defined as :math:`\\eta_0(X) = (g_0^{+}(X) + g_0^{-}(X))/2`.

ml_m : classifier implementing ``fit()`` and ``predict_proba()`` or None
A machine learner implementing ``fit()`` and ``predict_proba()`` methods and support ``sample_weights``(e.g.
A machine learner implementing ``fit()`` and ``predict_proba()`` methods and supporting ``sample_weights`` (e.g.
:py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance functions
:math:`m_0^{\\pm}(X) = E[D|\\text{score}=\\text{cutoff}^{\\pm}, X]`. The adjustment function is then
defined as :math:`\\eta_0(X) = (m_0^{+}(X) + m_0^{-}(X))/2`.
Expand Down Expand Up @@ -66,17 +66,29 @@ class RDFlex():
Default is ``cutoff``.

fs_kernel : str
Kernel for the first stage estimation. ``uniform``, ``triangular`` and ``epanechnikov``are supported.
Kernel for the first stage estimation. ``uniform``, ``triangular`` and ``epanechnikov`` are supported.
Default is ``triangular``.

**kwargs : kwargs
Keyword arguments that are not used within RDFlex but are passed directly to rdrobust.

Examples
--------

Notes
-----
>>> import numpy as np
>>> import doubleml as dml
>>> from doubleml.rdd.datasets import make_simple_rdd_data
>>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
>>> np.random.seed(123)
>>> data_dict = make_simple_rdd_data(fuzzy=True)
>>> obj_dml_data = dml.DoubleMLData.from_arrays(x=data_dict["X"], y=data_dict["Y"], d=data_dict["D"], s=data_dict["score"])
>>> ml_g = RandomForestRegressor()
>>> ml_m = RandomForestClassifier()
>>> rdflex_obj = dml.rdd.RDFlex(obj_dml_data, ml_g, ml_m, fuzzy=True)
>>> print(rdflex_obj.fit())
Method Coef. S.E. t-stat P>|t| 95% CI
-------------------------------------------------------------------------
Conventional 0.935 0.220 4.244 2.196e-05 [0.503, 1.367]
Robust - - 3.635 2.785e-04 [0.418, 1.396]

"""

Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
'scikit-learn>=1.4.0,<1.6.0',
'statsmodels',
'plotly',
'matplotlib',
'rdrobust',
],
python_requires=">=3.9",
classifiers=[
Expand Down
Loading