diff --git a/DESCRIPTION b/DESCRIPTION
index 31d13a9..5055a80 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
Package: medoutcon
Title: Efficient Natural and Interventional Causal Mediation Analysis
-Version: 0.2.2
+Version: 0.2.3
Authors@R: c(
person("Nima", "Hejazi", email = "nh@nimahejazi.org",
role = c("aut", "cre", "cph"),
diff --git a/LICENSE b/LICENSE
index fc423be..0f8023e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,2 +1,2 @@
-YEAR: 2020-2022
+YEAR: 2020-2024
COPYRIGHT HOLDER: Nima S. Hejazi
diff --git a/NEWS.md b/NEWS.md
index 38e6044..dc96875 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,21 @@
+# medoutcon 0.2.3
+
+* Added new named arguments `cv_strat` and `strat_pmin` to `est_onestep()` and
+  `est_tml()` and to the `estimator_args` list-argument in `medoutcon()`,
+  which allow stratified folds to be generated for cross-fitting (by passing
+  the outcome variable to the `strata_ids` argument of `make_folds()` from
+  the `origami` package). Stratification is triggered only when
+  `cv_strat = TRUE` and the proportion of detected cases falls below
+  `strat_pmin` (default 0.1), a heuristic for rare outcomes.
+* Increased the default number of folds for cross-fitting from 5 to 10, setting
+ `cv_folds = 10L` in named arguments to `est_onestep()` and `est_tml()` and to
+ the `estimator_args` list-argument in `medoutcon()`.
+* Increased the default maximum number of targeting iterations in `est_tml()`
+  from `max_iter = 5L` to `max_iter = 10L`.
+* Changed the default propensity score truncation bounds specified in
+  `g_bounds` from `c(0.01, 0.99)` (the default as of v0.2.2) to
+  `c(0.005, 0.995)`, based on sanity checks and manual experimentation.
+* Wrapped instances of `sl3_Task()` in which `outcome_type = "continuous"` is
+  specified in `suppressWarnings()` to silence warnings raised when the
+  outcome variable for a given nuisance estimation task fails `sl3`'s check
+  for continuousness.
+
# medoutcon 0.2.2
* Change iterative targeting procedures in `est_tml()` to use `glm2::glm2`
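As context for the `cv_strat` addition above, here is a minimal sketch of stratified fold construction with `origami` (the toy data, seed, and case rate are illustrative; `make_folds()` and `folds_vfold` are the same calls used in this diff):

```r
library(data.table)
library(origami)

set.seed(71235)
n <- 1000L
dat <- data.table(W = rnorm(n), Y = rbinom(n, 1, 0.05)) # ~5% cases

# stratified V-fold CV: strata_ids preserves the case/control ratio in
# every fold, so no validation fold is left without cases
folds <- make_folds(
  dat,
  fold_fun = folds_vfold,
  V = 10L,
  strata_ids = dat$Y
)

# per-fold case proportions should all sit near 0.05
sapply(folds, function(fold) mean(dat$Y[fold$validation_set]))
```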
diff --git a/R/estimators.R b/R/estimators.R
index 6cb49a6..d854261 100644
--- a/R/estimators.R
+++ b/R/estimators.R
@@ -81,7 +81,7 @@ cv_eif <- function(fold,
effect_type = c("interventional", "natural"),
w_names,
m_names,
- g_bounds = c(0.01, 0.99)) {
+ g_bounds = c(0.005, 0.995)) {
# make training and validation data
train_data <- origami::training(data_in)
valid_data <- origami::validation(data_in)
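For intuition on the tightened `g_bounds` default, truncation simply clamps estimated propensity scores into the stated interval before they enter inverse-probability terms; a sketch with a hypothetical helper (not medoutcon's internal code):

```r
# hypothetical helper: clamp estimated propensity scores to g_bounds
bound_propensity <- function(g_hat, g_bounds = c(0.005, 0.995)) {
  pmin(pmax(g_hat, g_bounds[1]), g_bounds[2])
}
bound_propensity(c(0.0001, 0.5, 0.9999))
#> [1] 0.005 0.500 0.995
```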
@@ -160,9 +160,11 @@ cv_eif <- function(fold,
g_star <- g_out$treat_est_valid$treat_pred_A_star[valid_data$R == 1]
h_prime <- h_out$treat_est_valid$treat_pred_A_prime
g_prime <- g_out$treat_est_valid$treat_pred_A_prime[valid_data$R == 1]
- q_prime_Z_one <- q_out$moc_est_valid_Z_one$moc_pred_A_prime[valid_data$R == 1]
+ q_prime_Z_one <-
+ q_out$moc_est_valid_Z_one$moc_pred_A_prime[valid_data$R == 1]
r_prime_Z_one <- r_out$moc_est_valid_Z_one$moc_pred_A_prime
- q_prime_Z_natural <- q_out$moc_est_valid_Z_natural$moc_pred_A_prime[valid_data$R == 1]
+ q_prime_Z_natural <-
+ q_out$moc_est_valid_Z_natural$moc_pred_A_prime[valid_data$R == 1]
r_prime_Z_natural <- r_out$moc_est_valid_Z_natural$moc_pred_A_prime
# need pseudo-outcome regressions with intervention set to a contrast
@@ -208,12 +210,14 @@ cv_eif <- function(fold,
# predict u(z, a', w) using intervened data with treatment set A = a'
# NOTE: here, obs_weights should not include two_phase_weights (?)
- u_task_valid_z_interv <- sl3::sl3_Task$new(
- data = valid_data_z_interv,
- weights = "obs_weights",
- covariates = c("Z", "A", w_names),
- outcome = "U_pseudo",
- outcome_type = "continuous"
+ suppressWarnings(
+ u_task_valid_z_interv <- sl3::sl3_Task$new(
+ data = valid_data_z_interv,
+ weights = "obs_weights",
+ covariates = c("Z", "A", w_names),
+ outcome = "U_pseudo",
+ outcome_type = "continuous"
+ )
)
# return partial pseudo-outcome for v nuisance regression
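A note on the `suppressWarnings()` wrapping used here and throughout this diff: because the wrapped expression is lazily evaluated in the calling frame, the assignment inside it still binds the task in the enclosing scope; only the warning is silenced. A one-line illustration:

```r
suppressWarnings(x <- as.numeric("abc")) # coercion warning silenced
x # the assignment still took effect in the calling environment
#> [1] NA
```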
@@ -352,7 +356,7 @@ two_phase_eif <- function(R,
# for each index in R with R == 0, add a zero at the same index in eif
new_eif <- rep(NA, length(R))
eif_idx <- 1
- for (idx in seq_len(length(R))) {
+ for (idx in seq_along(R)) {
if (R[idx] == 1) {
new_eif[idx] <- eif[eif_idx]
eif_idx <- eif_idx + 1
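The `seq_along()` change above is idiomatic rather than behavioral: `seq_len(length(R))` was already safe, but the related `1:length(x)` pattern it guards against misbehaves on empty vectors, as this small comparison shows:

```r
x <- integer(0)
seq_along(x) # integer(0): a loop over this runs zero times
1:length(x)  # 1 0: a loop over this would run twice, erroneously
```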
@@ -443,6 +447,16 @@ two_phase_eif <- function(R,
#' conditions on the one-step estimator to be relaxed. For compatibility with
#' \code{\link[origami]{make_folds}}, the value specified must be greater
#' than or equal to 2; the default is to create 10 folds.
+#' @param cv_strat A \code{logical} atomic vector indicating whether V-fold
+#' cross-validation should stratify the folds based on the outcome variable.
+#' If \code{TRUE} and the incidence of the binary outcome variable falls at
+#' or below the tolerance in \code{strat_pmin}, the folds are stratified by
+#' passing the outcome variable to the \code{strata_ids} argument of
+#' \code{\link[origami]{make_folds}}. The default is \code{FALSE}.
+#' @param strat_pmin A \code{numeric} atomic vector indicating the
+#' case-proportion threshold (for a binary outcome variable) at or below
+#' which stratified V-fold cross-validation is invoked when \code{cv_strat}
+#' is \code{TRUE}. The default tolerance is 0.1.
#'
#' @importFrom assertthat assert_that
#' @importFrom stats var weighted.mean
@@ -462,18 +476,35 @@ est_onestep <- function(data,
w_names,
m_names,
y_bounds,
- g_bounds = c(0.01, 0.99),
+ g_bounds = c(0.005, 0.995),
effect_type = c("interventional", "natural"),
svy_weights = NULL,
- cv_folds = 5L) {
+ cv_folds = 10L,
+ cv_strat = FALSE,
+ strat_pmin = 0.1) {
# make sure that more than one fold is specified
assertthat::assert_that(cv_folds > 1L)
# create cross-validation folds
- folds <- origami::make_folds(data,
- fold_fun = origami::folds_vfold,
- V = cv_folds
- )
+ if (cv_strat && data[, mean(Y) <= strat_pmin]) {
+ # check that outcome is binary for stratified V-fold cross-validation
+ assertthat::assert_that(data[, all(unique(Y) %in% c(0, 1))])
+
+ # if outcome is binary and rare, use stratified V-fold cross-validation
+ folds <- origami::make_folds(
+ data,
+ fold_fun = origami::folds_vfold,
+ V = cv_folds,
+ strata_ids = data$Y
+ )
+ } else {
+ # just use standard V-fold cross-validation
+ folds <- origami::make_folds(
+ data,
+ fold_fun = origami::folds_vfold,
+ V = cv_folds
+ )
+ }
# estimate the EIF on a per-fold basis
cv_eif_results <- origami::cross_validate(
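The two `data.table` j-expressions guarding the stratified branch above each evaluate inside the table and return a single logical; a toy check (simulated outcome column for illustration):

```r
library(data.table)

set.seed(1)
dt <- data.table(Y = rbinom(200, 1, 0.04))
dt[, mean(Y) <= 0.1]              # TRUE: rare-outcome trigger fires
dt[, all(unique(Y) %in% c(0, 1))] # TRUE: outcome is binary, as required
```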
@@ -599,6 +630,16 @@ est_onestep <- function(data,
#' conditions on the TML estimator to be relaxed. Note: for compatibility with
#' \code{\link[origami]{make_folds}}, this value must be greater than or
#' equal to 2; the default is to create 10 folds.
+#' @param cv_strat A \code{logical} atomic vector indicating whether V-fold
+#' cross-validation should stratify the folds based on the outcome variable.
+#' If \code{TRUE} and the incidence of the binary outcome variable falls at
+#' or below the tolerance in \code{strat_pmin}, the folds are stratified by
+#' passing the outcome variable to the \code{strata_ids} argument of
+#' \code{\link[origami]{make_folds}}. The default is \code{FALSE}.
+#' @param strat_pmin A \code{numeric} atomic vector indicating the
+#' case-proportion threshold (for a binary outcome variable) at or below
+#' which stratified V-fold cross-validation is invoked when \code{cv_strat}
+#' is \code{TRUE}. The default tolerance is 0.1.
#' @param max_iter A \code{numeric} integer giving the maximum number of steps
#' to be taken for the iterative procedure to construct a TML estimator.
#' @param tiltmod_tol A \code{numeric} indicating the maximum step size to be
@@ -626,20 +667,37 @@ est_tml <- function(data,
w_names,
m_names,
y_bounds,
- g_bounds = c(0.01, 0.99),
+                    g_bounds = c(0.005, 0.995),
effect_type = c("interventional", "natural"),
svy_weights = NULL,
- cv_folds = 5L,
- max_iter = 5L,
+ cv_folds = 10L,
+ cv_strat = FALSE,
+ strat_pmin = 0.1,
+ max_iter = 10L,
tiltmod_tol = 5) {
# make sure that more than one fold is specified
assertthat::assert_that(cv_folds > 1L)
# create cross-validation folds
- folds <- origami::make_folds(data,
- fold_fun = origami::folds_vfold,
- V = cv_folds
- )
+ if (cv_strat && data[, mean(Y) <= strat_pmin]) {
+ # check that outcome is binary for stratified V-fold cross-validation
+ assertthat::assert_that(data[, all(unique(Y) %in% c(0, 1))])
+
+ # if outcome is binary and rare, use stratified V-fold cross-validation
+ folds <- origami::make_folds(
+ data,
+ fold_fun = origami::folds_vfold,
+ V = cv_folds,
+ strata_ids = data$Y
+ )
+ } else {
+ # just use standard V-fold cross-validation
+ folds <- origami::make_folds(
+ data,
+ fold_fun = origami::folds_vfold,
+ V = cv_folds
+ )
+ }
# perform the cv_eif procedure on a per-fold basis
cv_eif_results <- origami::cross_validate(
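A hedged usage sketch of how these new knobs would be reached through `medoutcon()`'s `estimator_args` (argument names as introduced in this diff; `example_data`, `w_names`, and `m_names` as constructed in the package vignette):

```r
os_de <- medoutcon(
  W = example_data[, ..w_names],
  A = example_data$A,
  Z = example_data$Z,
  M = example_data[, ..m_names],
  Y = example_data$Y,
  effect = "direct",
  estimator = "onestep",
  estimator_args = list(cv_folds = 10L, cv_strat = TRUE, strat_pmin = 0.1)
)
```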
diff --git a/R/fit_mechanisms.R b/R/fit_mechanisms.R
index f6ba2b8..8239092 100644
--- a/R/fit_mechanisms.R
+++ b/R/fit_mechanisms.R
@@ -589,7 +589,8 @@ fit_nuisance_u <- function(train_data,
g_star <- g_out$treat_est_train$treat_pred_A_star[train_data$R == 1]
h_prime <- h_out$treat_est_train$treat_pred_A_prime
g_prime <- g_out$treat_est_train$treat_pred_A_prime[train_data$R == 1]
- q_prime_Z_natural <- q_out$moc_est_train_Z_natural$moc_pred_A_prime[train_data$R == 1]
+ q_prime_Z_natural <-
+ q_out$moc_est_train_Z_natural$moc_pred_A_prime[train_data$R == 1]
r_prime_Z_natural <- r_out$moc_est_train_Z_natural$moc_pred_A_prime
# remove observations that were not sampled in second stage
@@ -618,12 +619,14 @@ fit_nuisance_u <- function(train_data,
w_names, "A", "Z", "U_pseudo",
"obs_weights"
))
- u_task_train <- sl3::sl3_Task$new(
- data = u_data_train,
- weights = "obs_weights",
- covariates = c("Z", "A", w_names),
- outcome = "U_pseudo",
- outcome_type = "continuous"
+ suppressWarnings(
+ u_task_train <- sl3::sl3_Task$new(
+ data = u_data_train,
+ weights = "obs_weights",
+ covariates = c("Z", "A", w_names),
+ outcome = "U_pseudo",
+ outcome_type = "continuous"
+ )
)
## fit model for nuisance parameter regression on training data
@@ -640,12 +643,14 @@ fit_nuisance_u <- function(train_data,
w_names, "A", "Z", "U_pseudo",
"obs_weights"
))
- u_task_valid <- sl3::sl3_Task$new(
- data = u_data_valid,
- weights = "obs_weights",
- covariates = c("Z", "A", w_names),
- outcome = "U_pseudo",
- outcome_type = "continuous"
+ suppressWarnings(
+ u_task_valid <- sl3::sl3_Task$new(
+ data = u_data_valid,
+ weights = "obs_weights",
+ covariates = c("Z", "A", w_names),
+ outcome = "U_pseudo",
+ outcome_type = "continuous"
+ )
)
## predict from nuisance parameter regression on validation and training data
@@ -702,8 +707,10 @@ fit_nuisance_v <- function(train_data,
m_names,
w_names) {
## extract nuisance estimates necessary for this routine
- q_train_prime_Z_one <- q_out$moc_est_train_Z_one$moc_pred_A_prime[train_data$R == 1]
- q_valid_prime_Z_one <- q_out$moc_est_valid_Z_one$moc_pred_A_prime[valid_data$R == 1]
+ q_train_prime_Z_one <-
+ q_out$moc_est_train_Z_one$moc_pred_A_prime[train_data$R == 1]
+ q_valid_prime_Z_one <-
+ q_out$moc_est_valid_Z_one$moc_pred_A_prime[valid_data$R == 1]
# remove observations that were not sampled in second stage
train_data <- train_data[R == 1, ]
@@ -799,24 +806,28 @@ fit_nuisance_v <- function(train_data,
## build regression tasks for training and validation sets
train_data[, V_pseudo := v_pseudo_train]
- v_task_train <- sl3::sl3_Task$new(
- data = train_data,
- weights = "obs_weights", # NOTE: should not include two_phase_weights
- covariates = c("A", w_names),
- outcome = "V_pseudo",
- outcome_type = "continuous"
+ suppressWarnings(
+ v_task_train <- sl3::sl3_Task$new(
+ data = train_data,
+ weights = "obs_weights", # NOTE: should not include two_phase_weights
+ covariates = c("A", w_names),
+ outcome = "V_pseudo",
+ outcome_type = "continuous"
+ )
)
# NOTE: independent implementation from ID sets A to a* as done below
valid_data[, `:=`(
V_pseudo = v_pseudo_valid,
A = contrast[2]
)]
- v_task_valid <- sl3::sl3_Task$new(
- data = valid_data,
- weights = "obs_weights", # NOTE: should not include two_phase_weights
- covariates = c("A", w_names),
- outcome = "V_pseudo",
- outcome_type = "continuous"
+ suppressWarnings(
+ v_task_valid <- sl3::sl3_Task$new(
+ data = valid_data,
+ weights = "obs_weights", # NOTE: should not include two_phase_weights
+ covariates = c("A", w_names),
+ outcome = "V_pseudo",
+ outcome_type = "continuous"
+ )
)
## fit regression model for v on training task, get predictions on validation
@@ -911,8 +922,10 @@ fit_nuisance_d <- function(train_data,
g_prime <- g_out$treat_est_train$treat_pred_A_prime[train_data$R == 1]
u_prime <- u_out$u_train_pred
v_star <- v_out$v_train_pred
- q_prime_Z_one <- q_out$moc_est_train_Z_one$moc_pred_A_prime[train_data$R == 1]
- q_prime_Z_natural <- q_out$moc_est_train_Z_natural$moc_pred_A_prime[train_data$R == 1]
+ q_prime_Z_one <-
+ q_out$moc_est_train_Z_one$moc_pred_A_prime[train_data$R == 1]
+ q_prime_Z_natural <-
+ q_out$moc_est_train_Z_natural$moc_pred_A_prime[train_data$R == 1]
r_prime_Z_natural <- r_out$moc_est_train_Z_natural$moc_pred_A_prime
# NOTE: assuming Z in {0,1}; other cases not supported yet
@@ -926,12 +939,14 @@ fit_nuisance_d <- function(train_data,
)]
# predict u(z, a', w) using intervened data with treatment set A = a'
- u_task_train_z_interv <- sl3::sl3_Task$new(
- data = train_data_z_interv,
- weights = "obs_weights", # NOTE: should not include two_phase_weights
- covariates = c("Z", "A", w_names),
- outcome = "U_pseudo",
- outcome_type = "continuous"
+ suppressWarnings(
+ u_task_train_z_interv <- sl3::sl3_Task$new(
+ data = train_data_z_interv,
+ weights = "obs_weights", # NOTE: should not include two_phase_weights
+ covariates = c("Z", "A", w_names),
+ outcome = "U_pseudo",
+ outcome_type = "continuous"
+ )
)
# return partial pseudo-outcome for v nuisance regression
@@ -966,12 +981,14 @@ fit_nuisance_d <- function(train_data,
# generate the sl3 task
# NOTE: Purposefully not adding two-phase sampling weights
- d_task_train <- sl3::sl3_Task$new(
- data = eif_data_train,
- weights = "obs_weights", # NOTE: should not include two_phase_weights
- covariates = c(w_names, "A", "Z", "Y"),
- outcome = "eif",
- outcome_type = "continuous"
+ suppressWarnings(
+ d_task_train <- sl3::sl3_Task$new(
+ data = eif_data_train,
+ weights = "obs_weights", # NOTE: should not include two_phase_weights
+ covariates = c(w_names, "A", "Z", "Y"),
+ outcome = "eif",
+ outcome_type = "continuous"
+ )
)
## fit model for nuisance parameter regression on training data
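To make the wrapped task construction concrete, here is a self-contained toy version of the pseudo-outcome regression task (simulated data; column names mirror those used above):

```r
library(data.table)
library(sl3)

set.seed(9)
toy <- data.table(
  W_1 = rnorm(100),
  A = rbinom(100, 1, 0.5),
  Z = rbinom(100, 1, 0.5),
  U_pseudo = runif(100),
  obs_weights = rep(1, 100)
)
suppressWarnings(
  u_task <- sl3_Task$new(
    data = toy,
    weights = "obs_weights",
    covariates = c("Z", "A", "W_1"),
    outcome = "U_pseudo",
    outcome_type = "continuous"
  )
)
u_task$nrow
#> [1] 100
```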
diff --git a/R/medoutcon.R b/R/medoutcon.R
index a5a970e..4c43df6 100644
--- a/R/medoutcon.R
+++ b/R/medoutcon.R
@@ -24,9 +24,9 @@
#' weights corresponding to the inverse probability of the mediator being
#' measured. Defaults to a vector of ones.
#' @param effect A \code{character} indicating whether to compute the direct or
-#' the indirect effect as discussed in
diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html
index 7f7a227..b01126d 100644
--- a/docs/CONTRIBUTING.html
+++ b/docs/CONTRIBUTING.html
@@ -17,7 +17,7 @@
 Pull requests
 License
-YEAR: 2020-2022
+YEAR: 2020-2024
 COPYRIGHT HOLDER: Nima S. Hejazi
@@ -75,7 +75,7 @@ License
diff --git a/docs/LICENSE.html b/docs/LICENSE.html
index 26f1f61..b10d36b 100644
--- a/docs/LICENSE.html
+++ b/docs/LICENSE.html
@@ -17,7 +17,7 @@
@@ -79,7 +79,7 @@ MIT License
diff --git a/docs/articles/index.html b/docs/articles/index.html
index f4ff169..2920a16 100644
--- a/docs/articles/index.html
+++ b/docs/articles/index.html
@@ -17,7 +17,7 @@
@@ -72,7 +72,7 @@ All vignettes
diff --git a/docs/articles/intro_medoutcon.html b/docs/articles/intro_medoutcon.html
index 2ce69dc..8373de6 100644
--- a/docs/articles/intro_medoutcon.html
+++ b/docs/articles/intro_medoutcon.html
@@ -33,7 +33,7 @@
@@ -84,7 +84,7 @@
Nima Hejazi, Iván Díaz, and Kara Rudolph
- 2022-10-12
+ 2024-03-17
Source: vignettes/intro_medoutcon.Rmd
intro_medoutcon.Rmd
Setting up the data example
 y <- rbinom(n_obs, 1, plogis(1 / (rowSums(w) - z + a + m)))
## construct output
- dat <- as.data.table(cbind(w = w, a = a, z = z, m = m, y = y))
- setnames(dat, c(w_names, "A", "Z", m_names, "Y"))
+ dat <- as.data.table(cbind(w = w, a = a, z = z, m = m, y = y))
+ setnames(dat, c(w_names, "A", "Z", m_names, "Y"))
return(dat)
}
@@ -392,8 +392,10 @@ Ensemble learning of nuisance functions
hal_bounded_lrnr <- Pipeline$new(hal_gaussian_lrnr, bound_lrnr)
# create learner library and instantiate super learner ensemble
-lrnr_lib <- Stack$new(mean_lrnr, fglm_lrnr, enet_lrnr, lasso_lrnr,
- rf_lrnr, hal_bounded_lrnr)
+lrnr_lib <- Stack$new(
+ mean_lrnr, fglm_lrnr, enet_lrnr, lasso_lrnr,
+ rf_lrnr, hal_bounded_lrnr
+)
sl_lrnr <- Lrnr_sl$new(learners = lrnr_lib, metalearner = Lrnr_nnls$new())
While we recommend the use of a Super Learner ensemble model like the
one constructed above in practice, such a library will be too …
Estimating the direct effect
@@ -443,58 +445,62 @@
# compute one-step estimate of the interventional direct effect
-os_de <- medoutcon(W = example_data[, ..w_names],
- A = example_data$A,
- Z = example_data$Z,
- M = example_data[, ..m_names],
- Y = example_data$Y,
- g_learners = sl_lrnr,
- h_learners = sl_lrnr,
- b_learners = sl_lrnr,
- q_learners = sl_lrnr,
- r_learners = sl_lrnr,
- effect = "direct",
- estimator = "onestep",
- estimator_args = list(cv_folds = 2))
+os_de <- medoutcon(
+ W = example_data[, ..w_names],
+ A = example_data$A,
+ Z = example_data$Z,
+ M = example_data[, ..m_names],
+ Y = example_data$Y,
+ g_learners = sl_lrnr,
+ h_learners = sl_lrnr,
+ b_learners = sl_lrnr,
+ q_learners = sl_lrnr,
+ r_learners = sl_lrnr,
+ effect = "direct",
+ estimator = "onestep",
+ estimator_args = list(cv_folds = 2)
+)
summary(os_de)
 ## # A tibble: 1 × 7
-##   lwr_ci param_est upr_ci var_est  eif_mean estimator param
-##    <dbl>     <dbl>  <dbl>   <dbl>     <dbl> <chr>     <chr>
-## 1 -0.0935    0.0147  0.123 0.00305 -3.53e-17 onestep   direct_interventional
+##   lwr_ci param_est upr_ci var_est eif_mean estimator param
+##    <dbl>     <dbl>  <dbl>   <dbl>    <dbl> <chr>     <chr>
+## 1 -0.0862    0.0293  0.145 0.00347 3.38e-17 onestep   direct_interventional
 From the output of the summary method, we note that the one-step estimate
 of the interventional direct effect \(\hat{\theta}_{\text{os}}^{\text{DE}}\) is
-0.015, with 95% confidence interval [-0.094, 0.123].
Next, let’s compare the one-step estimate to the TML estimate.
Analogous to the case of the one-step estimator, the TML estimator can
be evaluated via a single call to the medoutcon
function:
# compute targeted minimum loss estimate of the interventional direct effect
-tmle_de <- medoutcon(W = example_data[, ..w_names],
- A = example_data$A,
- Z = example_data$Z,
- M = example_data[, ..m_names],
- Y = example_data$Y,
- g_learners = sl_lrnr,
- h_learners = sl_lrnr,
- b_learners = sl_lrnr,
- q_learners = sl_lrnr,
- r_learners = sl_lrnr,
- effect = "direct",
- estimator = "tmle",
- estimator_args = list(cv_folds = 2, max_iter = 5))
+tmle_de <- medoutcon(
+ W = example_data[, ..w_names],
+ A = example_data$A,
+ Z = example_data$Z,
+ M = example_data[, ..m_names],
+ Y = example_data$Y,
+ g_learners = sl_lrnr,
+ h_learners = sl_lrnr,
+ b_learners = sl_lrnr,
+ q_learners = sl_lrnr,
+ r_learners = sl_lrnr,
+ effect = "direct",
+ estimator = "tmle",
+ estimator_args = list(cv_folds = 2, max_iter = 5)
+)
summary(tmle_de)
## # A tibble: 1 × 7
## lwr_ci param_est upr_ci var_est eif_mean estimator param
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
-## 1 -0.0757 0.0247 0.125 0.00262 -0.00234 tmle direct_interventional
+## 1 -0.0941 0.0197 0.134 0.00337 0.00354 tmle direct_interventional
 From the output of the summary method, we note that the TML estimate of
 the interventional direct effect \(\hat{\theta}_{\text{tmle}}^{\text{DE}}\) is
-0.025, with 95% confidence interval [-0.076, 0.125]. Here, we recall
-that the TML estimator generally exhibits better finite-sample
-performance than the one-step estimator (van der
-Laan and Rose 2011, 2018), so the TML estimate is likely to be
-more reliable in our modest sample size of \(n
+0.02, with 95% confidence interval [-0.094, 0.134]. Here, we recall that
+the TML estimator generally exhibits better finite-sample performance
+than the one-step estimator (van der Laan and
+Rose 2011, 2018), so the TML estimate is likely to be more
+reliable in our modest sample size of \(n
 =\) 500.
# compute one-step estimate of the interventional indirect effect
-os_ie <- medoutcon(W = example_data[, ..w_names],
- A = example_data$A,
- Z = example_data$Z,
- M = example_data[, ..m_names],
- Y = example_data$Y,
- g_learners = sl_lrnr,
- h_learners = sl_lrnr,
- b_learners = sl_lrnr,
- q_learners = sl_lrnr,
- r_learners = sl_lrnr,
- effect = "indirect",
- estimator = "onestep")
+os_ie <- medoutcon(
+ W = example_data[, ..w_names],
+ A = example_data$A,
+ Z = example_data$Z,
+ M = example_data[, ..m_names],
+ Y = example_data$Y,
+ g_learners = sl_lrnr,
+ h_learners = sl_lrnr,
+ b_learners = sl_lrnr,
+ q_learners = sl_lrnr,
+ r_learners = sl_lrnr,
+ effect = "indirect",
+ estimator = "onestep"
+)
summary(os_ie)
## # A tibble: 1 × 7
## lwr_ci param_est upr_ci var_est eif_mean estimator param
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
-## 1 -0.241 -0.161 -0.0813 0.00167 -3.12e-17 onestep indirect_interventional
+## 1 -0.206 -0.143 -0.0802 0.00103 -5.42e-17 onestep indirect_interventional
 From the output of the summary method, we note that the one-step estimate
 of the interventional indirect effect \(\hat{\theta}_{\text{os}}^{\text{IE}}\) is
--0.161, with 95% confidence interval [-0.241, -0.081].
+-0.143, with 95% confidence interval [-0.206, -0.08].
 As before, let’s compare the one-step estimate to the TML estimate.
Analogous to the case of the one-step estimator, the TML estimator can
be evaluated via a single call to the medoutcon
function,
as demonstrated below
# compute targeted minimum loss estimate of the interventional indirect effect
-tmle_ie <- medoutcon(W = example_data[, ..w_names],
- A = example_data$A,
- Z = example_data$Z,
- M = example_data[, ..m_names],
- Y = example_data$Y,
- g_learners = sl_lrnr,
- h_learners = sl_lrnr,
- b_learners = sl_lrnr,
- q_learners = sl_lrnr,
- r_learners = sl_lrnr,
- effect = "indirect",
- estimator = "tmle")
+tmle_ie <- medoutcon(
+ W = example_data[, ..w_names],
+ A = example_data$A,
+ Z = example_data$Z,
+ M = example_data[, ..m_names],
+ Y = example_data$Y,
+ g_learners = sl_lrnr,
+ h_learners = sl_lrnr,
+ b_learners = sl_lrnr,
+ q_learners = sl_lrnr,
+ r_learners = sl_lrnr,
+ effect = "indirect",
+ estimator = "tmle"
+)
summary(tmle_ie)
## # A tibble: 1 × 7
## lwr_ci param_est upr_ci var_est eif_mean estimator param
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
-## 1 -0.204 -0.136 -0.0671 0.00122 0.000383 tmle indirect_interventional
+## 1 -0.197 -0.121 -0.0451 0.00151 -0.00834 tmle indirect_interventional
 From the output of the summary method, we note that the TML estimate of
 the interventional indirect effect \(\hat{\theta}_{\text{tmle}}^{\text{IE}}\) is
--0.136, with 95% confidence interval [-0.204, -0.067]. As before, the
+-0.121, with 95% confidence interval [-0.197, -0.045]. As before, the
 TML estimator provides better finite-sample performance than the
 one-step estimator, so it may be preferred in this example.
-Site built with pkgdown 2.0.6.
+Site built with pkgdown 2.0.7.
-Hejazi NS, Díaz I, Rudolph KE (2022).
+Hejazi NS, Díaz I, Rudolph KE (2024).
 medoutcon: Efficient causal mediation analysis for the natural and
 interventional effects.
-doi:10.5281/zenodo.5809519, R package version 0.2.0,
+doi:10.5281/zenodo.5809519, R package version 0.2.3,
 https://github.com/nhejazi/medoutcon.
 @Manual{,
   title = {{medoutcon}: Efficient causal mediation analysis for the natural
     and interventional effects},
   author = {Nima S Hejazi and Iván Díaz and Kara E Rudolph},
-  year = {2022},
-  note = {R package version 0.2.0},
+  year = {2024},
+  note = {R package version 0.2.3},
   doi = {10.5281/zenodo.5809519},
   url = {https://github.com/nhejazi/medoutcon},
 }
@@ -137,7 +137,7 @@
What is medoutcon?
-The medoutcon R package provides facilities for efficient estimation of path-specific (in)direct effects that measure the impact of a treatment variable A on an outcome variable Y, through a direct path (through A only) and an indirect path (through a set of mediators M only). In the presence of an intermediate mediator-outcome confounder Z, itself affected by the treatment A, these correspond to the interventional (in)direct effects described by Dı́az et al. (2020), though similar (yet less general) effect definitions and/or estimation strategies have appeared in VanderWeele, Vansteelandt, and Robins (2014), Rudolph et al. (2017), Zheng and van der Laan (2017), and Benkeser and Ran (2021). When no intermediate confounders are present, these effect definitions simplify to the well-studied natural (in)direct effects, and our estimators are analogs of those formulated by Zheng and van der Laan (2012). Both an efficient one-step bias-corrected estimator with cross-fitting (Pfanzagl and Wefelmeyer 1985; Zheng and van der Laan 2011; Chernozhukov et al. 2018) and a cross-validated targeted minimum loss estimator (TMLE) (van der Laan and Rose 2011; Zheng and van der Laan 2011) are made available. medoutcon integrates with the sl3 R package (Coyle et al. 2021) to leverage statistical machine learning in the estimation procedure.
+The medoutcon R package provides facilities for efficient estimation of path-specific (in)direct effects that measure the impact of a treatment variable A on an outcome variable Y, through a direct path (through A only) and an indirect path (through a set of mediators M only). In the presence of an intermediate mediator-outcome confounder Z, itself affected by the treatment A, these correspond to the interventional (in)direct effects described by Dı́az et al. (2020), though similar (yet less general) effect definitions and/or estimation strategies have appeared in @vanderweele2014effect, Rudolph et al. (2017), Zheng and van der Laan (2017), and Benkeser and Ran (2021). When no intermediate confounders are present, these effect definitions simplify to the well-studied natural (in)direct effects, and our estimators are analogs of those formulated by Zheng and van der Laan (2012). Both an efficient one-step bias-corrected estimator with cross-fitting (Pfanzagl and Wefelmeyer 1985; Zheng and van der Laan 2011; Chernozhukov et al. 2018) and a cross-validated targeted minimum loss estimator (TMLE) (van der Laan and Rose 2011; Zheng and van der Laan 2011) are made available. medoutcon integrates with the sl3 R package (Coyle et al. 2021) to leverage statistical machine learning in the estimation procedure.
library(data.table)
library(tidyverse)
-#> ── Attaching packages ─────────────────────────────────────────────────────── tidyverse 1.3.2 ──
-#> ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
-#> ✔ tibble 3.1.8 ✔ dplyr 1.0.10
-#> ✔ tidyr 1.2.1 ✔ stringr 1.4.1
-#> ✔ readr 2.1.2 ✔ forcats 0.5.2
-#> ── Conflicts ────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-#> ✖ dplyr::between() masks data.table::between()
-#> ✖ dplyr::filter() masks stats::filter()
-#> ✖ dplyr::first() masks data.table::first()
-#> ✖ dplyr::lag() masks stats::lag()
-#> ✖ dplyr::last() masks data.table::last()
-#> ✖ purrr::transpose() masks data.table::transpose()
+#> ── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
+#> ✔ dplyr 1.1.3 ✔ readr 2.1.4
+#> ✔ forcats 1.0.0 ✔ stringr 1.5.0
+#> ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
+#> ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
+#> ✔ purrr 1.0.2
+#> ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
+#> ✖ dplyr::between() masks data.table::between()
+#> ✖ dplyr::filter() masks stats::filter()
+#> ✖ dplyr::first() masks data.table::first()
+#> ✖ lubridate::hour() masks data.table::hour()
+#> ✖ lubridate::isoweek() masks data.table::isoweek()
+#> ✖ dplyr::lag() masks stats::lag()
+#> ✖ dplyr::last() masks data.table::last()
+#> ✖ lubridate::mday() masks data.table::mday()
+#> ✖ lubridate::minute() masks data.table::minute()
+#> ✖ lubridate::month() masks data.table::month()
+#> ✖ lubridate::quarter() masks data.table::quarter()
+#> ✖ lubridate::second() masks data.table::second()
+#> ✖ purrr::transpose() masks data.table::transpose()
+#> ✖ lubridate::wday() masks data.table::wday()
+#> ✖ lubridate::week() masks data.table::week()
+#> ✖ lubridate::yday() masks data.table::yday()
+#> ✖ lubridate::year() masks data.table::year()
+#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(medoutcon)
-#> medoutcon v0.1.6: Efficient Natural and Interventional Causal Mediation Analysis
-set.seed(1584)
+#> medoutcon v0.2.3: Efficient Natural and Interventional Causal Mediation Analysis
+set.seed(02138)
 # produces a simple data set based on a causal model with mediation
make_example_data <- function(n_obs = 1000) {
@@ -159,55 +172,59 @@ Example
y <- rbinom(n_obs, 1, plogis(1 / (rowSums(w) - z + a + m)))
## construct output
- dat <- as.data.table(cbind(w = w, a = a, z = z, m = m, y = y))
- setnames(dat, c(w_names, "A", "Z", m_names, "Y"))
+ dat <- as.data.table(cbind(w = w, a = a, z = z, m = m, y = y))
+ setnames(dat, c(w_names, "A", "Z", m_names, "Y"))
return(dat)
}
# set seed and simulate example data
-example_data <- make_example_data()
+example_data <- make_example_data(n_obs = 5000L)
w_names <- str_subset(colnames(example_data), "W")
m_names <- str_subset(colnames(example_data), "M")
# quick look at the data
head(example_data)
#> W_1 W_2 W_3 A Z M Y
-#> 1: 1 0 1 0 0 0 1
-#> 2: 0 1 0 0 0 1 0
-#> 3: 1 1 1 1 0 1 1
-#> 4: 0 1 1 0 0 1 0
-#> 5: 0 0 0 0 0 1 1
-#> 6: 1 0 1 1 0 1 0
+#> 1: 1 0 0 0 0 1 0
+#> 2: 0 0 0 0 0 0 1
+#> 3: 1 0 1 1 1 1 0
+#> 4: 1 0 1 1 0 1 1
+#> 5: 1 0 1 0 1 1 1
+#> 6: 1 0 0 0 0 1 0
# compute one-step estimate of the interventional direct effect
-os_de <- medoutcon(W = example_data[, ..w_names],
- A = example_data$A,
- Z = example_data$Z,
- M = example_data[, ..m_names],
- Y = example_data$Y,
- effect = "direct",
- estimator = "onestep")
+os_de <- medoutcon(
+ W = example_data[, ..w_names],
+ A = example_data$A,
+ Z = example_data$Z,
+ M = example_data[, ..m_names],
+ Y = example_data$Y,
+ effect = "direct",
+ estimator = "onestep"
+)
os_de
#> Interventional Direct Effect
#> Estimator: onestep
-#> Estimate: -0.065
-#> Std. Error: 0.054
-#> 95% CI: [-0.17, 0.041]
+#> Estimate: -0.101
+#> Std. Error: 0.028
+#> 95% CI: [-0.156, -0.045]
# compute targeted minimum loss estimate of the interventional direct effect
-tmle_de <- medoutcon(W = example_data[, ..w_names],
- A = example_data$A,
- Z = example_data$Z,
- M = example_data[, ..m_names],
- Y = example_data$Y,
- effect = "direct",
- estimator = "tmle")
+tmle_de <- medoutcon(
+ W = example_data[, ..w_names],
+ A = example_data$A,
+ Z = example_data$Z,
+ M = example_data[, ..m_names],
+ Y = example_data$Y,
+ effect = "direct",
+ estimator = "tmle"
+)
tmle_de
#> Interventional Direct Effect
#> Estimator: tmle
-#> Estimate: -0.06
-#> Std. Error: 0.058
-#> 95% CI: [-0.173, 0.053]
For details on how to use data adaptive regression (machine learning) techniques in the estimation of nuisance parameters, consider consulting the vignette that accompanies the package.
After using the medoutcon
R package, please cite the following:
@article{diaz2020nonparametric,
- ={Non-parametric efficient causal mediation with intermediate
- title
- confounders},={D{\'\i}az, Iv{\'a}n and Hejazi, Nima S and Rudolph, Kara E
- author and {van der Laan}, Mark J},
- year={2020},
- url = {https://arxiv.org/abs/1912.09936},
- doi = {10.1093/biomet/asaa085},
- journal={Biometrika},
- volume = {108},
- number = {3},
- pages = {627--641},
- publisher={Oxford University Press}
- }
-
- @article{hejazi2022medoutcon-joss,
- author = {Hejazi, Nima S and Rudolph, Kara E and D{\'\i}az,
- Iv{\'a}n},
- title = {{medoutcon}: Nonparametric efficient causal mediation
- analysis with machine learning in {R}},
- year = {2022},
- doi = {10.21105/joss.03979},
- url = {https://doi.org/10.21105/joss.03979},
- journal = {Journal of Open Source Software},
- publisher = {The Open Journal}
- }
-
- @software{hejazi2022medoutcon-rpkg,
- author={Hejazi, Nima S and D{\'\i}az, Iv{\'a}n and Rudolph, Kara E},
- title = {{medoutcon}: Efficient natural and interventional causal
- mediation analysis},
- year = {2022},
- doi = {10.5281/zenodo.5809519},
- url = {https://github.com/nhejazi/medoutcon},
- note = {R package version 0.1.6}
- }
@article{diaz2020nonparametric,
+ title={Non-parametric efficient causal mediation with intermediate
+ confounders},
+ author={D{\'\i}az, Iv{\'a}n and Hejazi, Nima S and Rudolph, Kara E
+ and {van der Laan}, Mark J},
+ year={2020},
+ url = {https://arxiv.org/abs/1912.09936},
+ doi = {10.1093/biomet/asaa085},
+ journal={Biometrika},
+ volume = {108},
+ number = {3},
+ pages = {627--641},
+ publisher={Oxford University Press}
+ }
+
+ @article{hejazi2022medoutcon-joss,
+ author = {Hejazi, Nima S and Rudolph, Kara E and D{\'\i}az,
+ Iv{\'a}n},
+ title = {{medoutcon}: Nonparametric efficient causal mediation
+ analysis with machine learning in {R}},
+ year = {2022},
+ doi = {10.21105/joss.03979},
+ url = {https://doi.org/10.21105/joss.03979},
+ journal = {Journal of Open Source Software},
+ publisher = {The Open Journal}
+ }
+
+ @software{hejazi2022medoutcon-rpkg,
+ author={Hejazi, Nima S and D{\'\i}az, Iv{\'a}n and Rudolph, Kara E},
+ title = {{medoutcon}: Efficient natural and interventional causal
+ mediation analysis},
+ year = {2024},
+ doi = {10.5281/zenodo.5809519},
+ url = {https://github.com/nhejazi/medoutcon},
+ note = {R package version 0.2.3}
+ }
-© 2020-2022 Nima S. Hejazi
+© 2020-2024 Nima S. Hejazi
The contents of this repository are distributed under the MIT license. See below for details:
-
- MIT License
-Copyright (c) 2020-2022 Nima S. Hejazi
-
-
- Permission is hereby granted, free of charge, to any person obtaining a copyfiles (the "Software"), to deal
- of this software and associated documentation in the Software without restriction, including without limitation the rights
-/or sell
- to use, copy, modify, merge, publish, distribute, sublicense, and
- copies of the Software, and to permit persons to whom the Software is:
- furnished to do so, subject to the following conditions
-in all
- The above copyright notice and this permission notice shall be included
- copies or substantial portions of the Software.
-"AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- THE SOFTWARE IS PROVIDED
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
MIT License
+
+Copyright (c) 2020-2024 Nima S. Hejazi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
-Coyle, Jeremy R, Nima S Hejazi, Ivana Malenica, Rachael V Phillips, and Oleg Sofrygin. 2021. “sl3: Modern Machine Learning Pipelines for Super Learning” (version 1.4.4). https://doi.org/10.5281/zenodo.1342293.
+Coyle, Jeremy R, Nima S Hejazi, Ivana Malenica, Rachael V Phillips, and Oleg Sofrygin. 2021. “sl3: Modern Machine Learning Pipelines for Super Learning.” https://doi.org/10.5281/zenodo.1342293.
 NEWS.md
+medoutcon 0.2.3
+Added new named arguments cv_strat and strat_pmin to est_onestep() and
+est_tml() and to the estimator_args list-argument in medoutcon(), which allow
+stratified folds to be generated for cross-fitting (by passing the outcome
+variable to the strata_ids argument of make_folds() from the origami
+package). Stratification is triggered only when cv_strat = TRUE and the
+proportion of detected cases falls below strat_pmin (default 0.1), a
+heuristic for rare outcomes.
+Increased the default number of folds for cross-fitting from 5 to 10, setting
+cv_folds = 10L in named arguments to est_onestep() and est_tml() and to the
+estimator_args list-argument in medoutcon().
+Increased the default maximum number of targeting iterations in est_tml()
+from max_iter = 5L to max_iter = 10L.
+Changed the default propensity score truncation bounds specified in g_bounds
+from c(0.01, 0.99) (the default as of v0.2.2) to c(0.005, 0.995), based on
+sanity checks and manual experimentation.
+Wrapped instances of sl3_Task() in which outcome_type = "continuous" is
+specified in suppressWarnings() to silence warnings raised when the outcome
+variable for a given nuisance estimation task fails sl3's check for
+continuousness.
 medoutcon 0.2.2
 Change iterative targeting procedures in est_tml() to use glm2::glm2 instead
 of stats::glm to avoid issues with erratic IRLS in the latter; see
 https://journal.r-project.org/archive/2011/RJ-2011-012/ for details.
 Changed default truncation bounds in g_bounds by an order of magnitude, from
 c(0.001, 0.999) to c(0.01, 0.99), to mitigate potential stability issues.
 … est_tml().
R/estimators.R
cv_eif.Rd
-EIF for stochastic interventional (in)direct effects
+EIF for natural and interventional (in)direct effects
+A numeric vector containing two values, the first being the minimum allowable
+estimated propensity score value and the second being the maximum allowable
+estimated propensity score value.
R/estimators.R
est_onestep.Rd
-One-step estimator for stochastic interventional (in)direct effects
+One-step estimator for natural and interventional (in)direct effects
+A numeric vector containing two values, the first being the minimum allowable
+estimated propensity score value and the second being the maximum allowable
+estimated propensity score value.
 A character indicating whether components of the interventional or natural
 (in)direct effects are to be estimated. In the
@@ -196,6 +204,15 @@
 make_folds, the value specified must be greater than or equal to 2; the
 default is to create 10 folds.
+A logical atomic vector indicating whether V-fold cross-validation should
+stratify the folds based on the outcome variable. If TRUE and the incidence
+of the binary outcome variable falls at or below the tolerance in strat_pmin,
+the folds are stratified by passing the outcome variable to the strata_ids
+argument of make_folds. The default is FALSE.
R/estimators.R
est_tml.Rd
-TML estimator for stochastic interventional (in)direct effects
+TML estimator for natural and interventional (in)direct effects
+A numeric vector containing two values, the first being the minimum allowable
+estimated propensity score value and the second being the maximum allowable
+estimated propensity score value.
 A character indicating whether components of the interventional or natural
 (in)direct effects are to be estimated. In the
@@ -199,6 +207,15 @@
+A logical atomic vector indicating whether V-fold cross-validation should
+stratify the folds based on the outcome variable. If TRUE and the incidence
+of the binary outcome variable falls at or below the tolerance in strat_pmin,
+the folds are stratified by passing the outcome variable to the strata_ids
+argument of make_folds. The default is FALSE.
 A numeric integer giving the maximum number of steps to be taken for the
 iterative procedure to construct a TML estimator.
 A Stack, or other learner class (inheriting from Lrnr_base), containing a set
 of learners from
-sl3, to be used in fitting a propensity score models, i.e., g :=
-P(A = 1 | W) and h := P(A = 1 | M, W).
 A character indicating which of the treatment mechanism
-variants to estimate. Option "g" corresponds to the propensity score
-g(A|W) while option "h" conditions on the mediators h(A|M,W).
+Option "g" is the propensity score g(A|W)
+while option "h" is a re-parameterized mediator density h(A|M,W).
+A numeric vector containing two values, the first being
+the minimum allowable estimated propensity score value and the second
+being the maximum allowable estimated propensity score value.
-Confidence intervals for interventional mediation effect estimates
+Confidence intervals for natural/interventional (in)direct effect estimates
-Efficient estimation of stochastic interventional (in)direct effects
+Efficient estimation of natural and interventional (in)direct effects
-Print method for interventional mediation effect estimate objects
+Print method for natural/interventional (in)direct effect estimate objects
-Summary for interventional mediation effect estimate objects
+Summary for natural/interventional (in)direct effect estimate objects
R/medoutcon.R
medoutcon.Rd
-Efficient estimation of stochastic interventional (in)direct effects
+Efficient estimation of natural and interventional (in)direct effects
 A logical vector indicating whether a sampled observation's mediator was
 measured via a two-phase sampling design. Defaults to a
-vector of ones, implying that two-phase sampling was not performed.
-A list of extra arguments to be passed (via
+A list of additional arguments passed (via
 ...) to the function call for the specified estimator. The default
-is chosen so as to allow the number of folds used in computing the one-step
-or TML estimators to be easily adjusted. In the case of the TML estimator,
-the number of update (fluctuation) iterations is limited, and a tolerance
-is included for the updates introduced by the tilting (fluctuation) models.
+A numeric vector containing two values, the first
+being the minimum allowable estimated propensity score value and the
+second being the maximum allowable estimated propensity score value.
R/utils.R
print.medoutcon.Rd
R/utils.R
summary.medoutcon.Rd