diff --git a/doubleml/did/datasets/dgp_did_cs_CS2021.py b/doubleml/did/datasets/dgp_did_cs_CS2021.py index 95119b948..08021270c 100644 --- a/doubleml/did/datasets/dgp_did_cs_CS2021.py +++ b/doubleml/did/datasets/dgp_did_cs_CS2021.py @@ -97,8 +97,8 @@ def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambd P(G_i = g) = \\frac{1}{G} \\text{ for all } g - 7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated indivials is increased - to `n_obs/lambda_t`, where `lambda_t` denotes the pobability to observe a unit at each time period (time constant). + 7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals is increased + to `n_obs/lambda_t`, where `lambda_t` denotes the probability to observe a unit at each time period (time constant). for each @@ -133,7 +133,8 @@ def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambd Whether to include units that are never treated. lambda_t : float, default=0.5 - Probability of observing a unit at each time period. + Probability of observing a unit at each time period. Note that internally `n_obs/lambda_t` individuals are + generated of which only a fraction `lambda_t` is observed at each time period (see Step 7 in the DGP description). time_type : str, default="datetime" Type of time variable. Either "datetime" or "float". diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 7a671993c..56bfe79c1 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -37,7 +37,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str @@ -47,7 +47,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Default is ``'observational'``. 
in_sample_normalization : bool - Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020). + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). Default is ``True``. trimming_rule : str diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index 99ce7ef91..83e49cd03 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -70,7 +70,7 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str @@ -80,7 +80,7 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML): Default is ``'observational'``. in_sample_normalization : bool - Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020). + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). Default is ``True``. trimming_rule : str diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index 8136f60c9..aa97996fe 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -37,7 +37,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str @@ -47,7 +47,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Default is ``'observational'``. in_sample_normalization : bool - Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020). + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). Default is ``True``. 
trimming_rule : str diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py index 73b9152fc..1754a8956 100644 --- a/doubleml/did/did_cs_binary.py +++ b/doubleml/did/did_cs_binary.py @@ -28,6 +28,78 @@ class DoubleMLDIDCSBinary(LinearScoreMixin, DoubleML): + """Double machine learning for difference-in-differences models with repeated cross sections (binary setting in terms of group and time + combinations). + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model. + + g_value : int + The value indicating the treatment group (first period with treatment). + Default is ``None``. This implements the case for the smallest, non-zero value of G. + + t_value_pre : int + The value indicating the baseline pre-treatment period. + + t_value_eval : int + The value indicating the period for evaluation. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`g_0(d,X) = E[Y_1-Y_0|D=d, X]`. + For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, + ``predict_proba()`` is used, otherwise ``predict()``. + + ml_m : classifier implementing ``fit()`` and ``predict_proba()`` + A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = P(D=1|X)`. + Only relevant for ``score='observational'``. + + control_group : str + Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. + Default is ``'never_treated'``. + + anticipation_periods : int + Number of anticipation periods. Default is ``0``. 
+ + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitions for the sample splitting. + Default is ``1``. + + score : str + A str (``'observational'`` or ``'experimental'``) specifying the score function. + The ``'experimental'`` score refers to an A/B setting, where the treatment is independent + from the pretreatment covariates. + Default is ``'observational'``. + + in_sample_normalization : bool + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). + Default is ``True``. + + trimming_rule : str + A str (``'truncate'`` is the only choice) specifying the trimming approach. + Default is ``'truncate'``. + + trimming_threshold : float + The threshold used for trimming. + Default is ``1e-2``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + print_periods : bool + Indicates whether to print information about the evaluated periods. + Default is ``False``. + + """ def __init__( self, diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index e8c75172c..9fcad8763 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -46,7 +46,7 @@ class DoubleMLAPO(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index d2aeaced6..29d78f15e 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -54,7 +54,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. 
score : str diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index b3cc11e78..73495fd7d 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -45,7 +45,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 9bf5ed35f..76e955f98 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -47,7 +47,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index c98e8fa2d..4b2377eea 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -49,7 +49,7 @@ class DoubleMLLPQ(NonLinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py index f64dc4719..7f40d27db 100644 --- a/doubleml/irm/pq.py +++ b/doubleml/irm/pq.py @@ -56,7 +56,7 @@ class DoubleMLPQ(NonLinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 68b91a9a7..9f617e3e9 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -39,7 +39,7 @@ class DoubleMLQTE: Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. 
score : str diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py index c84b326d7..e7e5d83c5 100644 --- a/doubleml/irm/ssm.py +++ b/doubleml/irm/ssm.py @@ -39,7 +39,7 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py index ba0226889..fdf4e28d1 100644 --- a/doubleml/plm/pliv.py +++ b/doubleml/plm/pliv.py @@ -45,7 +45,7 @@ class DoubleMLPLIV(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index a81bac48c..30ad763eb 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -44,7 +44,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 858ae5ed1..be1cd7973 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -50,7 +50,7 @@ class RDFlex: Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. cutoff : float or int