DoubleML · OliverSchacht · Dec 3, 2024 · Aug 13, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py
@@ -110,7 +110,7 @@ class DoubleMLData(DoubleMLBaseData):
         Default is ``None``.
 
     s_col : None or str
-        The selection variable (only relevant/used for SSM Estimatiors).
+        The score or selection variable (only relevant/used for RDD or SSM Estimators).
         Default is ``None``.
 
     use_other_treat_as_covariate : bool
@@ -182,7 +182,7 @@ def _data_summary_str(self):
         if self.t_col is not None:
             data_summary += f'Time variable: {self.t_col}\n'
         if self.s_col is not None:
-            data_summary += f'Selection variable: {self.s_col}\n'
+            data_summary += f'Score/Selection variable: {self.s_col}\n'
         data_summary += f'No. Observations: {self.n_obs}\n'
         return data_summary
 
@@ -212,7 +212,7 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
             Default is ``None``.
 
         s : :class:`numpy.ndarray`
-            Array of the selection variable (only relevant/used for SSM models).
+            Array of the score or selection variable (only relevant/used for RDD or SSM models).
             Default is ``None``.
 
         use_other_treat_as_covariate : bool
@@ -351,7 +351,7 @@ def t(self):
     @property
     def s(self):
         """
-        Array of selection variable.
+        Array of score or selection variable.
         """
         if self.s_col is not None:
             return self._s.values
@@ -538,7 +538,7 @@ def t_col(self, value):
     @property
     def s_col(self):
         """
-        The selection variable.
+        The score or selection variable.
         """
         return self._s_col
 
@@ -547,10 +547,10 @@ def s_col(self, value):
         reset_value = hasattr(self, '_s_col')
         if value is not None:
             if not isinstance(value, str):
-                raise TypeError('The selection variable s_col must be of str type (or None). '
+                raise TypeError('The score or selection variable s_col must be of str type (or None). '
                                 f'{str(value)} of type {str(type(value))} was passed.')
             if value not in self.all_variables:
-                raise ValueError('Invalid selection variable s_col. '
+                raise ValueError('Invalid score or selection variable s_col. '
                                  f'{value} is no data column.')
             self._s_col = value
         else:
@@ -725,24 +725,24 @@ def _check_disjoint_sets_t_s(self):
         if self.s_col is not None:
             s_col_set = {self.s_col}
             if not s_col_set.isdisjoint(x_cols_set):
-                raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and covariate in '
+                raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and covariate in '
                                  '``x_cols``.')
             if not s_col_set.isdisjoint(d_cols_set):
-                raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and treatment variable in '
-                                 '``d_cols``.')
+                raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment '
+                                 'variable in ``d_cols``.')
             if not s_col_set.isdisjoint(y_col_set):
-                raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and outcome variable '
-                                 '``y_col``.')
+                raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome '
+                                 'variable ``y_col``.')
             if self.z_cols is not None:
                 z_cols_set = set(self.z_cols)
                 if not s_col_set.isdisjoint(z_cols_set):
-                    raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and instrumental '
-                                     'variable in ``z_cols``.')
+                    raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and '
+                                     'instrumental variable in ``z_cols``.')
             if self.t_col is not None:
                 t_col_set = {self.t_col}
                 if not s_col_set.isdisjoint(t_col_set):
-                    raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and time variable '
-                                     '``t_col``.')
+                    raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time '
+                                     'variable ``t_col``.')
 
 
 class DoubleMLClusterData(DoubleMLData):
@@ -780,7 +780,7 @@ class DoubleMLClusterData(DoubleMLData):
         Default is ``None``.
 
     s_col : None or str
-        The selection variable (only relevant/used for SSM Estimatiors).
+        The score or selection variable (only relevant/used for RDD or SSM Estimators).
         Default is ``None``.
 
     use_other_treat_as_covariate : bool
@@ -854,7 +854,7 @@ def _data_summary_str(self):
         if self.t_col is not None:
             data_summary += f'Time variable: {self.t_col}\n'
         if self.s_col is not None:
-            data_summary += f'Selection variable: {self.s_col}\n'
+            data_summary += f'Score/Selection variable: {self.s_col}\n'
 
         data_summary += f'No. Observations: {self.n_obs}\n'
         return data_summary
@@ -888,7 +888,7 @@ def from_arrays(cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_tr
             Default is ``None``.
 
         s : :class:`numpy.ndarray`
-            Array of the selection variable (only relevant/used for SSM models).
+            Array of the score or selection variable (only relevant/used for RDD or SSM models).
             Default is ``None``.
 
         use_other_treat_as_covariate : bool
@@ -1039,7 +1039,7 @@ def _check_disjoint_sets_cluster_cols(self):
                                  'cluster variable in ``cluster_cols``.')
         if self.s_col is not None:
             if not s_col_set.isdisjoint(cluster_cols_set):
-                raise ValueError(f'{str(self.s_col)} cannot be set as selection variable ``s_col`` and '
+                raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and '
                                  'cluster variable in ``cluster_cols``.')
 
     def _set_cluster_vars(self):

diff --git a/doubleml/rdd/__init__.py b/doubleml/rdd/__init__.py
@@ -0,0 +1,9 @@
+"""
+The :mod:`doubleml.rdd` module implements double machine learning estimates for regression discontinuity designs.
+"""
+
+from .rdd import RDFlex
+
+__all__ = [
+    "RDFlex",
+]
diff --git a/doubleml/rdd/datasets/__init__.py b/doubleml/rdd/datasets/__init__.py
@@ -0,0 +1,9 @@
+"""
+The :mod:`doubleml.rdd.datasets` module implements data generating processes for regression discontinuity designs.
+"""
+
+from .simple_dgp import make_simple_rdd_data
+
+__all__ = [
+    "make_simple_rdd_data",
+]
diff --git a/doubleml/rdd/datasets/simple_dgp.py b/doubleml/rdd/datasets/simple_dgp.py
@@ -0,0 +1,103 @@
+import numpy as np
+from numpy.polynomial.polynomial import Polynomial
+
+
+def make_simple_rdd_data(n_obs=5000, p=4, fuzzy=True, binary_outcome=False, **kwargs):
+    """
+    Generates synthetic data for a regression discontinuity design (RDD) analysis.
+
+    .. math::
+        Y_0 &= g_0 + g_{cov} + \\epsilon_0 \\\\
+        Y_1 &= g_1 + g_{cov} + \\epsilon_1 \\\\
+        g_0 &= 0.1 \\cdot \\text{score}^2 \\\\
+        g_1 &= \\tau + 0.1 \\cdot \\text{score}^2 - 0.5 \\cdot \\text{score}^2 + a \\cdot \\text{score} \\sum_i X_i \\\\
+        g_{cov} &= \\sum_{i=1}^{\\text{dim\\_x}} \\text{Polynomial}(X_i) \\\\
+        \\epsilon_0, \\epsilon_1 &\\sim \\mathcal{N}(0, 0.2^2)
+
+    Parameters
+    ----------
+    n_obs : int
+        Number of observations to generate. Default is 5000.
+
+    p : int
+        Degree of the polynomial for covariates (coefficients ``0, 1, ..., p``). Default is 4.
+
+    fuzzy : bool
+        If True, generates data for a fuzzy RDD; otherwise treatment equals ``score >= cutoff``. Default is True.
+
+    binary_outcome : bool
+        If True, generates binary outcomes (Bernoulli draws, logistic link on ``g + g_cov``). Default is False.
+
+    **kwargs : Additional keyword arguments.
+        cutoff : float
+            The cutoff value for the score. Default is 0.0.
+        dim_x : int
+            The number of independent covariates. Default is 3.
+        a : float
+            Factor to control the interaction of score and covariates in the treated outcome equation. Default is 0.0.
+        tau : float
+            Parameter to control the true effect in the generated data at the given cutoff. Default is 1.0.
+
+    Returns
+    -------
+    dict: A dictionary containing the generated data with keys:
+        'score' (np.ndarray): The running variable.
+        'Y' (np.ndarray): The observed outcome, ``Y0 * (1 - D) + Y1 * D``.
+        'D' (np.ndarray): The realized treatment (equals ``score >= cutoff`` if ``fuzzy=False``).
+        'X' (np.ndarray): The independent covariates.
+        'oracle_values' (dict): The potential outcomes ``'Y0'`` (untreated) and ``'Y1'`` (treated).
+    """
+
+    cutoff = kwargs.get('cutoff', 0.0)
+    dim_x = kwargs.get('dim_x', 3)
+    a = kwargs.get('a', 0.0)
+    tau = kwargs.get('tau', 1.0)
+
+    score = np.random.normal(size=n_obs)
+    # independent covariates, uniform on [-1, 1]
+    X = np.random.uniform(size=(n_obs, dim_x), low=-1, high=1)
+
+    # Create polynomials of covariates (one column per covariate, coefficients 0, 1, ..., p)
+    if p == 0:
+        covs = np.zeros((n_obs, 1))
+    else:
+        covs = np.column_stack([Polynomial(np.arange(p + 1))(X[:, i]) for i in range(X.shape[1])])
+    g_cov = np.sum(covs, axis=1)
+
+    g0 = 0.1 * score**2
+    g1 = tau + 0.1 * score**2 - 0.5 * score**2 + a * np.sum(X, axis=1) * score
+
+    eps_scale = 0.2
+    # potential outcomes with independent errors (Gaussian noise, or Bernoulli draws via a logistic link)
+    if not binary_outcome:
+        Y0 = g0 + g_cov + np.random.normal(size=n_obs, scale=eps_scale)
+        Y1 = g1 + g_cov + np.random.normal(size=n_obs, scale=eps_scale)
+    else:
+        p_Y0 = 1 / (1 + np.exp(-1.0 * (g0 + g_cov)))
+        p_Y1 = 1 / (1 + np.exp(-1.0 * (g1 + g_cov)))
+        Y0 = np.random.binomial(n=1, p=p_Y0, size=n_obs)
+        Y1 = np.random.binomial(n=1, p=p_Y1, size=n_obs)
+
+    intended_treatment = (score >= cutoff).astype(int)
+    if fuzzy:
+        prob = 0.3 + 0.4 * intended_treatment + 0.01 * score**2 - 0.02 * score**2 * intended_treatment + 0.2 * g_cov
+        prob = np.clip(prob, 0.0, 1.0)  # the linear index can leave [0, 1]; binomial needs valid probabilities
+        D = np.random.binomial(n=1, p=prob, size=n_obs)
+    else:
+        D = intended_treatment
+
+    D = D.astype(int)
+    Y = Y0 * (1 - D) + Y1 * D  # observed outcome: the potential outcome selected by the realized treatment
+
+    oracle_values = {
+        'Y0': Y0,
+        'Y1': Y1,
+    }
+    res_dict = {
+        'score': score,
+        'Y': Y,
+        'D': D,
+        'X': X,
+        'oracle_values': oracle_values
+    }
+    return res_dict