diff --git a/NAMESPACE b/NAMESPACE index 7bcc23c..92ed15d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,12 +64,14 @@ S3method(summary,tsls) S3method(tsls,default) S3method(tsls,formula) export(LassoShooting.fit) +export(RMD_stable) export(lambdaCalculation) export(p_adjust) export(print_coef) export(rlasso) export(rlassoATE) export(rlassoATET) +export(rlassoAutoDML) export(rlassoEffect) export(rlassoEffects) export(rlassoIV) diff --git a/R/LassoShooting.fit.R b/R/LassoShooting.fit.R index e7bcbbe..8f2d39e 100644 --- a/R/LassoShooting.fit.R +++ b/R/LassoShooting.fit.R @@ -27,14 +27,18 @@ #' @export -LassoShooting.fit <- function(x, y, lambda, control = list(maxIter = 1000, - optTol = 10^(-5), zeroThreshold = 10^(-6)), XX = NULL, Xy = NULL, beta.start = NULL) { +LassoShooting.fit <- function(x, y, lambda, control = list( + maxIter = 1000, + optTol = 10^(-5), zeroThreshold = 10^(-6) + ), XX = NULL, Xy = NULL, beta.start = NULL) { n <- dim(x)[1] p <- dim(x)[2] - if (is.null(XX)) + if (is.null(XX)) { (XX <- crossprod(x)) - if (is.null(Xy)) + } + if (is.null(Xy)) { (Xy <- crossprod(x, y)) + } # Start from the LS solution for beta if no beta.start is provided if (is.null(beta.start)) { # Ridge start beta <- MASS::ginv(XX + diag(as.vector(lambda), p) %*% @@ -49,7 +53,7 @@ LassoShooting.fit <- function(x, y, lambda, control = list(maxIter = 1000, m <- 1 XX2 <- XX * 2 Xy2 <- Xy * 2 - + while (m < control$maxIter) { beta_old <- beta for (j in 1:p) { @@ -59,13 +63,16 @@ LassoShooting.fit <- function(x, y, lambda, control = list(maxIter = 1000, beta[j] <- 0 next } - - if (S0 > lambda[j]) - beta[j] <- (lambda[j] - S0)/XX2[j, j] - if (S0 < -1 * lambda[j]) - beta[j] <- (-1 * lambda[j] - S0)/XX2[j, j] - if (abs(S0) <= lambda[j]) + + if (S0 > lambda[j]) { + beta[j] <- (lambda[j] - S0) / XX2[j, j] + } + if (S0 < -1 * lambda[j]) { + beta[j] <- (-1 * lambda[j] - S0) / XX2[j, j] + } + if (abs(S0) <= lambda[j]) { beta[j] <- 0 + } } # Update wp <- cbind(wp, beta) @@ -78,4 +85,4 @@ LassoShooting.fit <- function(x, y, lambda, control = list(maxIter = 1000, w <- beta w[abs(w) < control$zeroThreshold] <- 0 return(list(coefficients = w, coef.list = wp, num.it = m)) -} \ No newline at end of file +} diff --git a/R/help_functions.R b/R/help_functions.R index 800e34c..03dc148 100644 --- a/R/help_functions.R +++ b/R/help_functions.R @@ -1,5 +1,9 @@ -format.perc <- function(probs, digits) paste(format(100 * probs, trim = TRUE, - scientific = FALSE, digits = digits), "%") +format.perc <- function(probs, digits) { + paste(format(100 * probs, + trim = TRUE, + scientific = FALSE, digits = digits + ), "%") +} # function for calculation of the errors after choosing the five # variables with the highest correlation @@ -16,7 +20,7 @@ init_values <- function(X, y, number = 5, intercept = TRUE) { reg <- lm(y ~ -1 + X[, index, drop = FALSE]) coefficients[index] <- coef(reg) } - coefficients[is.na( coefficients)] <- 0 + coefficients[is.na(coefficients)] <- 0 res <- list(residuals = reg$residuals, coefficients = coefficients) return(res) } @@ -24,12 +28,14 @@ init_values <- function(X, y, number = 5, intercept = TRUE) { # function for escaping [ and other special characters in formula calls -re.escape <- function(strings){ - vals <- c("\\\\", "\\[", "\\]", "\\(", "\\)", - "\\{", "\\}", "\\^", "\\$","\\*", - "\\+", "\\?", "\\.", "\\|") +re.escape <- function(strings) { + vals <- c( + "\\\\", "\\[", "\\]", "\\(", "\\)", + "\\{", "\\}", "\\^", "\\$", "\\*", + "\\+", "\\?", "\\.", "\\|" + ) replace.vals <- paste0("\\\\", vals) - for(i in 
seq_along(vals)){ + for (i in seq_along(vals)) { strings <- gsub(vals[i], replace.vals[i], strings) } strings @@ -39,165 +45,167 @@ re.escape <- function(strings){ ######################## functions for extracting x,y,d,z from formula element ##### Define the function f.formula -f.formula <- function(formula, data, all.categories = FALSE){ +f.formula <- function(formula, data, all.categories = FALSE) { cl <- match.call() - if (missing(data)) data <- environment(formula) + if (missing(data)) data <- environment(formula) mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data"), names(mf), 0) mf <- mf[c(1L, m)] mf$drop.unused.levels <- TRUE - + ### if 'formula' and 'data' arguments are in reverse order: "switch", i.e. restore intended order/re-name - if( any(c("formula", "Formula", "character") %in% is(data)) & - any(c("data.frame", "matrix") %in% is(formula)) ){ + if (any(c("formula", "Formula", "character") %in% is(data)) & + any(c("data.frame", "matrix") %in% is(formula))) { mf$formula <- data mf$data <- formula formula <- mf$formula data <- mf$data } - ### - + ### + matrix.flag <- FALSE - if(is.matrix(data)){ + if (is.matrix(data)) { warning("'data' is a matrix and has been converted to a data.frame.") data <- as.data.frame(data) matrix.flag <- TRUE } - + ### Add (potentially "updated") 'data' to model.frame mf$data <- data - + formula <- Formula::as.Formula(formula) - if(!identical(x=length(formula), y=c(1L, 2L))){ - if(length(formula)[1] < 1L) stop("The LHS of formula must include a response variable.", call. = FALSE) - if(length(formula)[1] > 1L) stop("Multiple responses not implemented. The LHS of formula must include exactly one response variable.", call. = FALSE) - if(length(formula)[2] != 2L) stop("RHS of formula is of improper length. formula must be specified as y ~ x + d | x + z, where y is response, x are exogenous regressors (on both sides of '|'), d are endogenous regressors (treatments), and z are instruments. Use of '.' to the right of '|' is permitted.", call. = FALSE) + if (!identical(x = length(formula), y = c(1L, 2L))) { + if (length(formula)[1] < 1L) stop("The LHS of formula must include a response variable.", call. = FALSE) + if (length(formula)[1] > 1L) stop("Multiple responses not implemented. The LHS of formula must include exactly one response variable.", call. = FALSE) + if (length(formula)[2] != 2L) stop("RHS of formula is of improper length. formula must be specified as y ~ x + d | x + z, where y is response, x are exogenous regressors (on both sides of '|'), d are endogenous regressors (treatments), and z are instruments. Use of '.' to the right of '|' is permitted.", call. = FALSE) } - - f1 <- formula(formula, rhs = 1) # "y ~ x + d" - part - f2 <- formula(formula, lhs = 0, rhs = 2) # " ~ x + z" or " ~ . - d + z" - formula <- Formula::as.Formula(f1, update(formula(formula, lhs = 0, rhs = 1), f2)) # update (only relevant if a `.` to the right of |) - - + + f1 <- formula(formula, rhs = 1) # "y ~ x + d" - part + f2 <- formula(formula, lhs = 0, rhs = 2) # " ~ x + z" or " ~ . - d + z" + formula <- Formula::as.Formula(f1, update(formula(formula, lhs = 0, rhs = 1), f2)) # update (only relevant if a `.` to the right of |) + + # Warning if (some) variable names are given as 'data[, 1]' or 'data[, "y"] instead of simply 'y'. 
- if(any(grepl("[", as.character(formula), fixed=TRUE))){ - if(matrix.flag == TRUE){ + if (any(grepl("[", as.character(formula), fixed = TRUE))) { + if (matrix.flag == TRUE) { stop("Use of '[...]' for variable names, as in 'data[, 1]' or data[, \"y\"], is not possible if 'data' is a matrix. \n '[...]' works for most cases if 'data' is a data.frame, but its use is still discouraged.") - }else{ - warning("Use of '[...]' for variable names, as in 'data[, 1]' or data[, \"y\"], is discouraged. It will work for most cases, especially when used consistently. Checking if the LHS variable is also included on the RHS is not available and setting 'all.categories' to TRUE has no effect.", call. = FALSE) + } else { + warning("Use of '[...]' for variable names, as in 'data[, 1]' or data[, \"y\"], is discouraged. It will work for most cases, especially when used consistently. Checking if the LHS variable is also included on the RHS is not available and setting 'all.categories' to TRUE has no effect.", call. = FALSE) } } # Warning if (some) variable names are given as 'data$x' instead of simply 'x'. - if(any(grepl("$", as.character(formula), fixed=TRUE))){ - if(matrix.flag == TRUE){ - stop("Use of '$' for variable names, as in 'data$y ~ data$x1 + data$d1 | data$x1 + data$z1', is not possible if 'data' is a matrix. \n '$' works for most cases if 'data' is a data.frame, but its use is still discouraged.") - }else{ - warning("Use of '$' for variable names, as in 'data$y ~ data$x1 + data$d1 | data$x1 + data$z1', is discouraged. It will work for most cases, especially when used consistently. Checking if the LHS variable is also included on the RHS is not available and setting 'all.categories' to TRUE has no effect.", call. = FALSE) + if (any(grepl("$", as.character(formula), fixed = TRUE))) { + if (matrix.flag == TRUE) { + stop("Use of '$' for variable names, as in 'data$y ~ data$x1 + data$d1 | data$x1 + data$z1', is not possible if 'data' is a matrix. \n '$' works for most cases if 'data' is a data.frame, but its use is still discouraged.") + } else { + warning("Use of '$' for variable names, as in 'data$y ~ data$x1 + data$d1 | data$x1 + data$z1', is discouraged. It will work for most cases, especially when used consistently. Checking if the LHS variable is also included on the RHS is not available and setting 'all.categories' to TRUE has no effect.", call. = FALSE) } - - } + + } # Check: were there are any variables from LHS used on the RHS (to the left or to the right of | )? - if( any( setdiff(all.vars(formula(formula, lhs=1, rhs=0)), c( deparse(substitute(data)) )) %in% # Note: remove the name of the data set to avoid wrong errors (in case of variable names given as data$y) - all.vars(formula(formula, lhs=0, rhs=1:2)) ) ){ - if(missing(data)){ + if (any(setdiff(all.vars(formula(formula, lhs = 1, rhs = 0)), c(deparse(substitute(data)))) %in% # Note: remove the name of the data set to avoid wrong errors (in case of variable names given as data$y) + all.vars(formula(formula, lhs = 0, rhs = 1:2)))) { + if (missing(data)) { stop("Argument 'data' is missing.", call. = FALSE) - }else{ - stop("LHS and RHS of formula must not include the same variable(s).", call. = FALSE) + } else { + stop("LHS and RHS of formula must not include the same variable(s).", call. 
= FALSE) } } - - + + # Add updated formula to model.frame mf$formula <- formula ##### - - + + mf[[1L]] <- as.name("model.frame") # Anm: as in ivreg(), whereas in lasso.formula: mf[[1L]] <- quote(stats::model.frame) mf <- eval(mf, parent.frame()) - - + + ##### ## Extract the response variable # - only one response variable allowed; stopped above with error() if more than one response variable # - extract -after transformation (if any)- response as a single column matrix - - Y <- as.matrix(model.part(formula, data = mf, lhs = 1, rhs=0, terms=FALSE, drop = FALSE)) + + Y <- as.matrix(model.part(formula, data = mf, lhs = 1, rhs = 0, terms = FALSE, drop = FALSE)) # Anm: in ivreg and in lasso.formula: Y <- model.response(mf, "numeric") # -> model.part() works even for multiple responses where model.response() will fail ##### - - - + + + ##### ## Extract X, D, and Z # Reminder: formula is akin to y ~ x + d | x + z # where x are exogenous, d endogenous, and z instruments (used only for the first stage) # the second part (x + z) contains exogenous variables and instruments # -> it has already been updated (see above) - exin <- formula(formula, lhs=0, rhs=2) - + exin <- formula(formula, lhs = 0, rhs = 2) + # the first part (x + d) contains exogenous and endogenous variables - exen <- formula(formula, lhs=0, rhs=1) - + exen <- formula(formula, lhs = 0, rhs = 1) + # names of the components in X, D, and Z x <- intersect(attr(terms(exen), "term.labels"), attr(terms(exin), "term.labels")) - d <- setdiff (attr(terms(exen), "term.labels"), attr(terms(exin), "term.labels")) # {x, d} \ {x, z} = {d} - z <- setdiff (attr(terms(exin), "term.labels"), attr(terms(exen), "term.labels")) # {x, z} \ {x, d} = {z} - + d <- setdiff(attr(terms(exen), "term.labels"), attr(terms(exin), "term.labels")) # {x, d} \ {x, z} = {d} + z <- setdiff(attr(terms(exin), "term.labels"), attr(terms(exen), "term.labels")) # {x, z} \ {x, d} = {z} + if (is.environment(data)) data <- mf - - if (length(d)==0) { + + if (length(d) == 0) { D <- NULL } else { - conlist.D <- sapply(data, is.factor) # factor variables in data - form.D <- as.formula(paste(" ~ ", paste(d, collapse= " + ", sep=""), sep="")) # formula for D - conlist.D[which(!(names(conlist.D) %in% attr(terms(form.D), "term.labels")))] <- FALSE # TRUE for all factor variables in D - D <- model.matrix(form.D, data = mf, contrasts.arg = lapply(data[, conlist.D, drop = FALSE], contrasts, contrasts = !all.categories)) - D <- D[, -which(colnames(D) == "(Intercept)"), drop = FALSE] + conlist.D <- sapply(data, is.factor) # factor variables in data + form.D <- as.formula(paste(" ~ ", paste(d, collapse = " + ", sep = ""), sep = "")) # formula for D + conlist.D[which(!(names(conlist.D) %in% attr(terms(form.D), "term.labels")))] <- FALSE # TRUE for all factor variables in D + D <- model.matrix(form.D, data = mf, contrasts.arg = lapply(data[, conlist.D, drop = FALSE], contrasts, contrasts = !all.categories)) + D <- D[, -which(colnames(D) == "(Intercept)"), drop = FALSE] } - - if (length(z)==0) { + + if (length(z) == 0) { Z <- NULL } else { - conlist.Z <- sapply(data, is.factor) # factor variables in data - form.Z <- as.formula(paste(" ~ ", paste(z, collapse= " + ", sep=""), sep="")) # formula for Z - conlist.Z[which(!(names(conlist.Z) %in% attr(terms(form.Z), "term.labels")))] <- FALSE # TRUE for all factor variables in Z - Z <- model.matrix(form.Z, data = mf, contrasts.arg = lapply(data[, conlist.Z, drop = FALSE], contrasts, contrasts = !all.categories)) - Z <- Z[, -which(colnames(Z) == 
"(Intercept)"), drop = FALSE] + conlist.Z <- sapply(data, is.factor) # factor variables in data + form.Z <- as.formula(paste(" ~ ", paste(z, collapse = " + ", sep = ""), sep = "")) # formula for Z + conlist.Z[which(!(names(conlist.Z) %in% attr(terms(form.Z), "term.labels")))] <- FALSE # TRUE for all factor variables in Z + Z <- model.matrix(form.Z, data = mf, contrasts.arg = lapply(data[, conlist.Z, drop = FALSE], contrasts, contrasts = !all.categories)) + Z <- Z[, -which(colnames(Z) == "(Intercept)"), drop = FALSE] } - - if (length(x)==0) { + + if (length(x) == 0) { X <- NULL } else { - conlist.X <- sapply(data, is.factor) # factor variables in data - form.X <- as.formula(paste(" ~ ", paste(x, collapse= " + ", sep=""), sep="")) # formula for X - conlist.X[which(!(names(conlist.X) %in% attr(terms(form.X), "term.labels")))] <- FALSE # TRUE for all factor variables in X - X <- model.matrix(form.X, data = mf, contrasts.arg = lapply(data[, conlist.X, drop = FALSE], contrasts, contrasts = !all.categories)) - X <- X[, -which(colnames(X) == "(Intercept)"), drop = FALSE] + conlist.X <- sapply(data, is.factor) # factor variables in data + form.X <- as.formula(paste(" ~ ", paste(x, collapse = " + ", sep = ""), sep = "")) # formula for X + conlist.X[which(!(names(conlist.X) %in% attr(terms(form.X), "term.labels")))] <- FALSE # TRUE for all factor variables in X + X <- model.matrix(form.X, data = mf, contrasts.arg = lapply(data[, conlist.X, drop = FALSE], contrasts, contrasts = !all.categories)) + X <- X[, -which(colnames(X) == "(Intercept)"), drop = FALSE] } - - attr(D, "assign") <- NULL ; attr(Z, "assign") <- NULL ; attr(X, "assign") <- NULL + + attr(D, "assign") <- NULL + attr(Z, "assign") <- NULL + attr(X, "assign") <- NULL ##### - - + + ###### - #Extract intercepts for the x + d part (intercept2) and for the x + z part (intercept1): TRUE or FALSE. - if(attr(terms(exen), "intercept") == 1L){ + # Extract intercepts for the x + d part (intercept2) and for the x + z part (intercept1): TRUE or FALSE. 
+ if (attr(terms(exen), "intercept") == 1L) { intercept2 <- TRUE - }else{ + } else { intercept2 <- FALSE } - if(attr(terms(exin), "intercept") == 1L){ + if (attr(terms(exin), "intercept") == 1L) { intercept1 <- TRUE - }else{ + } else { intercept1 <- FALSE } - + Y <- as.vector(Y) - + return(list(Y = Y, X = X, D = D, Z = Z, intercept1 = intercept1, intercept2 = intercept2)) ###### } @@ -206,122 +214,140 @@ f.formula <- function(formula, data, all.categories = FALSE){ ############################## help function for tracing the position of selected variables specified by a one-sided formula include <- function(I, data) { - -I.orig <- I # save original I -if(!is.null(I)){ - # error message if I is not a formula/Formula or numeric/integer or logical: - if(!any(c("formula", "Formula", "numeric", "integer", "logical") %in% class(I))){ - stop("'I' must be either an object of class formula/Formula or a numeric/integer or logical vector.") - } - if(any(c("numeric", "integer","logical") %in% class(I))){ - - # convert the x + d part of the rhs of formula to character - f1.no.y <- as.character(formula(formula, rhs = 1, lhs = 0))[[2]] - # f1.no.y is to be split by the '+'-operator - # if '-'-operator is used (e.g., y ~ -1 + x1 + ...), the '+' needs to be added beforehand: - f1.no.y <- gsub(pattern = "-", replacement ="\\+ -", f1.no.y, - ignore.case = FALSE, perl = FALSE, - fixed = FALSE, useBytes = FALSE) - # convert into character and splitting by '+'-operator: - f1.no.y.string <- as.vector(unlist(strsplit(f1.no.y, "\\+ "))) - # eliminate empty entries (i.e.: "") - f1.no.y.string <- f1.no.y.string[which(f1.no.y.string != "")] - # the elements in f1.no.y.string can now be indexed by numeric/integer or logical 'I' - - - - if(class(I.orig) %in% c("numeric", "integer")){ - # check: for numeric/integer I, no 'non-existent' elements can be selected - # i.e. if 'formula' has k elements between '~' and '|', only 1, 2, ..., k are allowed - # but -5, 0, k+1, 1.5, etc. are not allowed - if(!all(I.orig %in% 1:length(f1.no.y.string))){ - index.in <- I.orig %in% 1:length(f1.no.y.string) # logical index vector (problematic entries = FALSE) - stop(paste("'formula' was determined to have ", length(f1.no.y.string), - " elements between '~' and '|'. The numeric/integer 'I' that was provided tried to select a non-existent element (e.g., 0, a negative number, a non-integer number, or a number greater than ", length(f1.no.y.string), "). 
The problematic entries in I were: ", - paste(I.orig[!index.in], collapse = ",", sep=" "), ".", sep="")) + I.orig <- I # save original I + + if (!is.null(I)) { + # error message if I is not a formula/Formula or numeric/integer or logical: + if (!any(c("formula", "Formula", "numeric", "integer", "logical") %in% class(I))) { + stop("'I' must be either an object of class formula/Formula or a numeric/integer or logical vector.") + } + if (any(c("numeric", "integer", "logical") %in% class(I))) { + + # convert the x + d part of the rhs of formula to character + f1.no.y <- as.character(formula(formula, rhs = 1, lhs = 0))[[2]] + # f1.no.y is to be split by the '+'-operator + # if '-'-operator is used (e.g., y ~ -1 + x1 + ...), the '+' needs to be added beforehand: + f1.no.y <- gsub( + pattern = "-", replacement = "\\+ -", f1.no.y, + ignore.case = FALSE, perl = FALSE, + fixed = FALSE, useBytes = FALSE + ) + # convert into character and splitting by '+'-operator: + f1.no.y.string <- as.vector(unlist(strsplit(f1.no.y, "\\+ "))) + # eliminate empty entries (i.e.: "") + f1.no.y.string <- f1.no.y.string[which(f1.no.y.string != "")] + # the elements in f1.no.y.string can now be indexed by numeric/integer or logical 'I' + + + + if (class(I.orig) %in% c("numeric", "integer")) { + # check: for numeric/integer I, no 'non-existent' elements can be selected + # i.e. if 'formula' has k elements between '~' and '|', only 1, 2, ..., k are allowed + # but -5, 0, k+1, 1.5, etc. are not allowed + if (!all(I.orig %in% 1:length(f1.no.y.string))) { + index.in <- I.orig %in% 1:length(f1.no.y.string) # logical index vector (problematic entries = FALSE) + stop(paste("'formula' was determined to have ", length(f1.no.y.string), + " elements between '~' and '|'. The numeric/integer 'I' that was provided tried to select a non-existent element (e.g., 0, a negative number, a non-integer number, or a number greater than ", length(f1.no.y.string), "). The problematic entries in I were: ", + paste(I.orig[!index.in], collapse = ",", sep = " "), ".", + sep = "" + )) + } + } + # check: for logical I, it has to have the appropriate length: + if (class(I.orig) == "logical" & length(I.orig) != length(f1.no.y.string)) { + stop(paste("If 'I' is logical, its length has to be the same as the number of 'elements' (", + length(f1.no.y.string), + ") in the 'x + d'-part of 'formula'.", + sep = "" + )) } + + # I is now a formula as implied by the user-specified I, i.e. the 'formula' ''indexed'' by the user-specified I + I <- as.formula(paste("~ ", paste(f1.no.y.string[I], collapse = " + "), sep = "")) } - # check: for logical I, it has to have the appropriate length: - if(class(I.orig) == "logical" & length(I.orig) != length(f1.no.y.string)){ - stop(paste("If 'I' is logical, its length has to be the same as the number of 'elements' (", - length(f1.no.y.string), - ") in the 'x + d'-part of 'formula'.", sep="")) + + + # error message if I is too long. + # formula-class: length must be 2 + # Formula-class: length must be c(0, 1) + if ("formula" %in% class(I)) { + if (!("Formula" %in% class(I))) { + if (length(I) > 2L) { + stop("'I' must be a one-sided formula, e.g. '~ x1 + x2'.") + } + } else { + if (length(I)[1] > 0L) { + stop("'I' must be a one-sided formula: '~ x1 + x2' is ok, 'y ~ x1 + x2' is not.") + } + if (length(I)[2] > 1L) { + stop("'I' must be a one-sided formula, '~ x1 + x2' is ok, '~ x1 + x2 | z1 + z2' is not.") + } + } } - - # I is now a formula as implied by the user-specified I, i.e. 
the 'formula' ''indexed'' by the user-specified I - I <- as.formula(paste("~ ", paste(f1.no.y.string[I], collapse = " + "), sep="")) - } - - - # error message if I is too long. - # formula-class: length must be 2 - # Formula-class: length must be c(0, 1) - if("formula" %in% class(I)){ - if(!("Formula" %in% class(I))){ - if(length(I) > 2L){ stop("'I' must be a one-sided formula, e.g. '~ x1 + x2'.") } - }else{ - if(length(I)[1] > 0L){ stop("'I' must be a one-sided formula: '~ x1 + x2' is ok, 'y ~ x1 + x2' is not.") } - if(length(I)[2] > 1L){ stop("'I' must be a one-sided formula, '~ x1 + x2' is ok, '~ x1 + x2 | z1 + z2' is not.") } + + + # names of the components in X that are selected via the argument I: + i <- attr(terms(as.formula(I)), "term.labels") + + # error: if formula is like y ~ 1 + x1 + ..., or y ~ 0 + x1 + ..., or y ~ -1 + x1 + ..., + # and if I is ~ 1, or c(1), or c(TRUE, FALSE, FALSE, ...), + # then I did not selected any terms/variables, but only the intercept. + # In this case, the just constructed 'i' is 'character(0)', and operating on such an 'i' + # would yield an uninformative error. + # Instead so an informative error is produced in the next lines of code. + if (length(i) == 0L) { + stop("Argument 'I' did not select any (non-intercept) terms from 'formula'. Most likely reason: 'formula' explicitly addressed the intercept (i.e., -1, 0, 1) and this is also the only component from 'formula' that is selected via 'I'.") } + + + # construction of X.I: the model matrix X with only those columns which are selected via I: + # (construction of X.I is analog to that of X, D, and Z) + conlist.X.I <- sapply(data, is.factor) # factor variables in data + form.X.I <- as.formula(paste(" ~ ", paste(i, collapse = " + ", sep = ""), sep = "")) # formula for X.I + conlist.X.I[which(!(names(conlist.X.I) %in% attr(terms(form.X.I), "term.labels")))] <- FALSE # TRUE for all factor variables in X + X.I <- model.matrix(form.X.I, data = data, contrasts.arg = lapply(data[, conlist.X.I, drop = FALSE], contrasts, contrasts = !all.categories)) + X.I <- X.I[, -which(colnames(X.I) == "(Intercept)"), drop = FALSE] + attr(X.I, "assign") <- NULL + + # check: are all columns (columns names in X.I) also in X? if not: stop with error. + if (!all(colnames(X.I) %in% colnames(X))) { + stop(paste("'I' resulted in column names that are not in X. Please only use terms in I as they appear in the 'x'-part of formula and no 'd'- or 'z'-terms. Also note that any x-term must be present both to the left and to the right of '|' in 'formula'. 
\n Encountered column names that result from I but are not in X: ", + paste(setdiff(colnames(X.I), colnames(X)), collapse = ", "), + ".\n", + sep = "" + )) + } + + I.c <- which(colnames(X) %in% colnames(X.I)) + + + # check: for each column in X.I: check if the column in X with the same name has the same values as the column in X.I + # (a security check) + equal.values.invariables <- function(number, X.matrix = X, X.I.matrix = X.I) { + col.name <- colnames(X.matrix)[number] # column name of the column in 'X.matrix' given by 'number' + # as all.equal returns TRUE or a message, and not simply TRUE/FALSE, use: isTRUE(all.equal(...)) + return(isTRUE(all.equal(X.matrix[, col.name, drop = FALSE], X.I.matrix[, col.name, drop = FALSE], check.attributes = FALSE))) + } # function for comparison + list.equal.values <- sapply(I.c, FUN = equal.values.invariables, X.matrix = X, X.I.matrix = X.I) # FALSE for those columns with non-matching entries + # if there are any columns with non-matching entries: stop with error message that includes the colnames of the problematic columns: + if (any(list.equal.values == FALSE)) { + stop(paste("'I' resulted in at least one column that has the same name in X and X.I (the design matrix created from 'I') but has different values in X and X.I. One possible scenario: a factor variable, say 'x5' (levels: A, B, C, D), is included in 'formula' so that dummies x5A, x5B, x5C, x5D are created for design matrix X. If there is already a (different) variable named, say 'x5B', in 'data' and it is included in 'I', 'x5B' in X.I will be the column from 'data' and not the one created by model.matrix() for X.\n The problematic column names are: ", + paste(colnames(X)[I.c[!list.equal.values]], collapse = ",", sep = ", "), ".\n", + sep = "" + )) + } + + + } - - - # names of the components in X that are selected via the argument I: - i <- attr(terms(as.formula(I)), "term.labels") - - # error: if formula is like y ~ 1 + x1 + ..., or y ~ 0 + x1 + ..., or y ~ -1 + x1 + ..., - # and if I is ~ 1, or c(1), or c(TRUE, FALSE, FALSE, ...), - # then I did not selected any terms/variables, but only the intercept. - # In this case, the just constructed 'i' is 'character(0)', and operating on such an 'i' - # would yield an uninformative error. - # Instead so an informative error is produced in the next lines of code. - if(length(i) == 0L){ - stop("Argument 'I' did not select any (non-intercept) terms from 'formula'. Most likely reason: 'formula' explicitly addressed the intercept (i.e., -1, 0, 1) and this is also the only component from 'formula' that is selected via 'I'.") - } - - - # construction of X.I: the model matrix X with only those columns which are selected via I: - # (construction of X.I is analog to that of X, D, and Z) - conlist.X.I <- sapply(data, is.factor) # factor variables in data - form.X.I <- as.formula(paste(" ~ ", paste(i, collapse= " + ", sep=""), sep="")) # formula for X.I - conlist.X.I[which(!(names(conlist.X.I) %in% attr(terms(form.X.I), "term.labels")))] <- FALSE # TRUE for all factor variables in X - X.I <- model.matrix(form.X.I, data = data, contrasts.arg = lapply(data[, conlist.X.I, drop = FALSE], contrasts, contrasts = !all.categories)) - X.I <- X.I[, -which(colnames(X.I) == "(Intercept)"), drop = FALSE] - attr(X.I, "assign") <- NULL - - # check: are all columns (columns names in X.I) also in X? if not: stop with error. - if(!all(colnames(X.I) %in% colnames(X))){ - stop(paste("'I' resulted in column names that are not in X. 
Please only use terms in I as they appear in the 'x'-part of formula and no 'd'- or 'z'-terms. Also note that any x-term must be present both to the left and to the right of '|' in 'formula'. \n Encountered column names that result from I but are not in X: ", - paste(setdiff(colnames(X.I), colnames(X)), collapse=", "), - ".\n", sep="")) - } - - I.c <- which(colnames(X) %in% colnames(X.I)) - - - # check: for each column in X.I: check if the column in X with the same name has the same values as the column in X.I - # (a security check) - equal.values.invariables <- function(number, X.matrix = X, X.I.matrix = X.I){ - col.name <- colnames(X.matrix)[number] # column name of the column in 'X.matrix' given by 'number' - # as all.equal returns TRUE or a message, and not simply TRUE/FALSE, use: isTRUE(all.equal(...)) - return( isTRUE(all.equal(X.matrix[, col.name, drop = FALSE], X.I.matrix[, col.name, drop = FALSE], check.attributes = FALSE))) - } # function for comparison - list.equal.values <- sapply(I.c, FUN = equal.values.invariables, X.matrix = X, X.I.matrix = X.I) # FALSE for those columns with non-matching entries - # if there are any columns with non-matching entries: stop with error message that includes the colnames of the problematic columns: - if(any(list.equal.values == FALSE)){ - stop( paste("'I' resulted in at least one column that has the same name in X and X.I (the design matrix created from 'I') but has different values in X and X.I. One possible scenario: a factor variable, say 'x5' (levels: A, B, C, D), is included in 'formula' so that dummies x5A, x5B, x5C, x5D are created for design matrix X. If there is already a (different) variable named, say 'x5B', in 'data' and it is included in 'I', 'x5B' in X.I will be the column from 'data' and not the one created by model.matrix() for X.\n The problematic column names are: ", - paste(colnames(X)[I.c[!list.equal.values]], collapse = ",", sep=", "), ".\n", sep="") ) - } - - - -} -# generate I.c = NULL in case no I is provided, so that I.c = NULL can be passed on to workhorse function -if(is.null(I)){ I.c <- NULL } + # generate I.c = NULL in case no I is provided, so that I.c = NULL can be passed on to workhorse function + if (is.null(I)) { + I.c <- NULL + } -return(I.c) + return(I.c) } ####################################################################################### @@ -329,16 +355,16 @@ return(I.c) ############################## help function for tracing the position of selected variables specified by a one-sided formula # include <- function(I, data) { -# +# # I.orig <- I # save original I -# +# # if(!is.null(I)){ # # error message if I is not a formula/Formula or numeric/integer or logical: # if(!any(c("formula", "Formula", "numeric", "integer", "logical") %in% class(I))){ # stop("'I' must be either an object of class formula/Formula or a numeric/integer or logical vector.") # } # if(any(c("numeric", "integer","logical") %in% class(I))){ -# +# # # convert the x + d part of the rhs of formula to character # f1.no.y <- as.character(formula(formula, rhs = 1, lhs = 0))[[2]] # # f1.no.y is to be split by the '+'-operator @@ -351,13 +377,13 @@ return(I.c) # # eliminate empty entries (i.e.: "") # f1.no.y.string <- f1.no.y.string[which(f1.no.y.string != "")] # # the elements in f1.no.y.string can now be indexed by numeric/integer or logical 'I' -# -# -# +# +# +# # if(class(I.orig) %in% c("numeric", "integer")){ # # check: for numeric/integer I, no 'non-existent' elements can be selected # # i.e. 
if 'formula' has k elements between '~' and '|', only 1, 2, ..., k are allowed -# # but -5, 0, k+1, 1.5, etc. are not allowed +# # but -5, 0, k+1, 1.5, etc. are not allowed # if(!all(I.orig %in% 1:length(f1.no.y.string))){ # index.in <- I.orig %in% 1:length(f1.no.y.string) # logical index vector (problematic entries = FALSE) # stop(paste("'formula' was determined to have ", length(f1.no.y.string), @@ -371,12 +397,12 @@ return(I.c) # length(f1.no.y.string), # ") in the 'x + d'-part of 'formula'.", sep="")) # } -# +# # # I is now a formula as implied by the user-specified I, i.e. the 'formula' ''indexed'' by the user-specified I # I <- as.formula(paste("~ ", paste(f1.no.y.string[I], collapse = " + "), sep="")) # } -# -# +# +# # # error message if I is too long. # # formula-class: length must be 2 # # Formula-class: length must be c(0, 1) @@ -388,22 +414,22 @@ return(I.c) # if(length(I)[2] > 1L){ stop("'I' must be a one-sided formula, '~ x1 + x2' is ok, '~ x1 + x2 | z1 + z2' is not.") } # } # } -# -# +# +# # # names of the components in X that are selected via the argument I: # i <- attr(terms(as.formula(I)), "term.labels") -# +# # # error: if formula is like y ~ 1 + x1 + ..., or y ~ 0 + x1 + ..., or y ~ -1 + x1 + ..., # # and if I is ~ 1, or c(1), or c(TRUE, FALSE, FALSE, ...), -# # then I did not selected any terms/variables, but only the intercept. +# # then I did not selected any terms/variables, but only the intercept. # # In this case, the just constructed 'i' is 'character(0)', and operating on such an 'i' # # would yield an uninformative error. # # Instead so an informative error is produced in the next lines of code. # if(length(i) == 0L){ # stop("Argument 'I' did not select any (non-intercept) terms from 'formula'. Most likely reason: 'formula' explicitly addressed the intercept (i.e., -1, 0, 1) and this is also the only component from 'formula' that is selected via 'I'.") # } -# -# +# +# # # construction of X.I: the model matrix X with only those columns which are selected via I: # # (construction of X.I is analog to that of X, D, and Z) # conlist.X.I <- sapply(data, is.factor) # factor variables in data @@ -412,43 +438,51 @@ return(I.c) # X.I <- model.matrix(form.X.I, data = data, contrasts.arg = lapply(data[, conlist.X.I, drop = FALSE], contrasts, contrasts = !all.categories)) # X.I <- X.I[, -which(colnames(X.I) == "(Intercept)"), drop = FALSE] # attr(X.I, "assign") <- NULL -# -# +# +# # I.c <- which(colnames(X) %in% colnames) -# +# # # generate I.c = NULL in case no I is provided, so that I.c = NULL can be passed on to workhorse function # if(is.null(I)){ I.c <- NULL } -# +# # return(I.c) # } -# } +# } ########### - check_variables <- function(formula, varnames) { - - if (is.null(formula)) return(NULL) - - if("formula" %in% class(formula)){ - if(!("Formula" %in% class(formula))){ - if(length(formula) > 2L){ stop("'I' must be a one-sided formula, e.g. '~ x1 + x2'.") } - }else{ - if(length(formula)[1] > 0L){ stop("'I' must be a one-sided formula: '~ x1 + x2' is ok, 'y ~ x1 + x2' is not.") } - if(length(formula)[2] > 1L){ stop("'I' must be a one-sided formula, '~ x1 + x2' is ok, '~ x1 + x2 | z1 + z2' is not.") } +check_variables <- function(formula, varnames) { + + if (is.null(formula)) { + return(NULL) + } + + if ("formula" %in% class(formula)) { + if (!("Formula" %in% class(formula))) { + if (length(formula) > 2L) { + stop("'I' must be a one-sided formula, e.g. 
'~ x1 + x2'.") + } + } else { + if (length(formula)[1] > 0L) { + stop("'I' must be a one-sided formula: '~ x1 + x2' is ok, 'y ~ x1 + x2' is not.") + } + if (length(formula)[2] > 1L) { + stop("'I' must be a one-sided formula, '~ x1 + x2' is ok, '~ x1 + x2 | z1 + z2' is not.") } } - - i <- attr(terms(as.formula(formula)), "term.labels") - I.c <- match(i, varnames) - return(I.c) } + i <- attr(terms(as.formula(formula)), "term.labels") + I.c <- match(i, varnames) + return(I.c) +} + ################## function for checking of binary variable - + check_binary <- function(x) { - - if (!setequal(unique(x), c(0,1))) { - stop("Treatment variable and Instrumental Variable should be binary (0/1)!") + + if (!setequal(unique(x), c(0, 1))) { + stop("Treatment variable and Instrumental Variable should be binary (0/1)!") } return(invisible(NULL)) } @@ -458,21 +492,20 @@ constructIV <- function(firmid, cdid, id, X) { n <- dim(X)[1] p <- dim(X)[2] names <- colnames(X) - if (is.null(names)) names <- paste("V", 1:p, sep="") - sum.other <- matrix(NA, nrow = n, ncol = p) - colnames(sum.other) <- paste("sum.other.", names, sep="") - sum.rival <- matrix(NA, nrow = n, ncol = p) - colnames(sum.rival) <- paste("sum.rival.", names, sep="") - + if (is.null(names)) names <- paste("V", 1:p, sep = "") + sum.other <- matrix(NA, nrow = n, ncol = p) + colnames(sum.other) <- paste("sum.other.", names, sep = "") + sum.rival <- matrix(NA, nrow = n, ncol = p) + colnames(sum.rival) <- paste("sum.rival.", names, sep = "") + for (i in 1:n) { for (j in 1:p) { - other_ind=(firmid==firmid[i] & cdid==cdid[i] & id!=id[i]) - rival_ind=(firmid!=firmid[i] & cdid==cdid[i]) - - sum.other[i,j]=sum(X[other_ind==1,j]) - sum.rival[i,j]=sum(X[rival_ind==1,j]) + other_ind <- (firmid == firmid[i] & cdid == cdid[i] & id != id[i]) + rival_ind <- (firmid != firmid[i] & cdid == cdid[i]) + + sum.other[i, j] <- sum(X[other_ind == 1, j]) + sum.rival[i, j] <- sum(X[rival_ind == 1, j]) } } return(cbind(sum.other, sum.rival)) } - diff --git a/R/p_adjust.R b/R/p_adjust.R index 2893bb4..df17f75 100644 --- a/R/p_adjust.R +++ b/R/p_adjust.R @@ -1,65 +1,65 @@ -#'Multiple Testing Adjustment of p-values for S3 objects \code{rlassoEffects} -#'and \code{lm} +#' Multiple Testing Adjustment of p-values for S3 objects \code{rlassoEffects} +#' and \code{lm} #' -#'Multiple hypotheses testing adjustment of p-values from a high-dimensional -#'linear model. +#' Multiple hypotheses testing adjustment of p-values from a high-dimensional +#' linear model. #' -#'Multiple testing adjustment is performed for S3 objects of class -#'\code{rlassoEffects} and \code{lm}. Implemented methods for multiple testing -#'adjustment are Romano-Wolf stepdown '\code{RW}' (default) and the adjustment -#'methods available in the \code{p.adjust} function of the \code{stats} package, -#'including the Bonferroni, Bonferroni-Holm, and Benjamini-Hochberg corrections, -#'see \code{\link{p.adjust.methods}}. +#' Multiple testing adjustment is performed for S3 objects of class +#' \code{rlassoEffects} and \code{lm}. Implemented methods for multiple testing +#' adjustment are Romano-Wolf stepdown '\code{RW}' (default) and the adjustment +#' methods available in the \code{p.adjust} function of the \code{stats} package, +#' including the Bonferroni, Bonferroni-Holm, and Benjamini-Hochberg corrections, +#' see \code{\link{p.adjust.methods}}. #' -#'Objects of class \code{rlassoEffects} are constructed by -#'\code{\link{rlassoEffects}}. 
+#' Objects of class \code{rlassoEffects} are constructed by +#' \code{\link{rlassoEffects}}. #' -#'@param x an object of S3 class \code{rlassoEffects} or \code{lm}. -#'@param method the method of p-value adjustment for multiple testing. +#' @param x an object of S3 class \code{rlassoEffects} or \code{lm}. +#' @param method the method of p-value adjustment for multiple testing. #' Romano-Wolf stepdown ('\code{RW}') is chosen by default. -#'@param test.index vector of integers, logicals or variables names indicating +#' @param test.index vector of integers, logicals or variables names indicating #' the position of coefficients (integer case), logical vector of length of the #' coefficients (TRUE or FALSE) or the coefficient names of x which should be #' tested simultaneously (only for S3 class \code{lm}). If missing, all #' coefficients are considered. -#'@param B number of bootstrap repetitions (default 1000). -#'@param ... further arguments passed on to methods. -#'@rdname p_adjust -#'@aliases p_adjust.rlassoEffects p_adjust.lm -#'@return A matrix with the estimated coefficients and the p-values that are +#' @param B number of bootstrap repetitions (default 1000). +#' @param ... further arguments passed on to methods. +#' @rdname p_adjust +#' @aliases p_adjust.rlassoEffects p_adjust.lm +#' @return A matrix with the estimated coefficients and the p-values that are #' adjusted according to the specified method. -#'@references J.P. Romano, M. Wolf (2005). Exact and approximate stepdown +#' @references J.P. Romano, M. Wolf (2005). Exact and approximate stepdown #' methods for multiple hypothesis testing. Journal of the American Statistical #' Association, 100(469), 94-108. -#'@references J.P. Romano, M. Wolf (2016). Efficient computation of adjusted +#' @references J.P. Romano, M. Wolf (2016). Efficient computation of adjusted #' p-values for resampling-based stepdown multiple testing. Statistics and #' Probability Letters, (113), 38-40. -#'@references A. Belloni, V. Chernozhukov, K. Kato (2015). Uniform +#' @references A. Belloni, V. Chernozhukov, K. Kato (2015). Uniform #' post-selection inference for least absolute deviation regression and other #' Z-estimation problems. Biometrika, 102(1), 77-94. 
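For orientation, the Romano-Wolf stepdown that both p_adjust methods in the hunks below implement can be sketched in a few lines of base R. This is only an illustration with simulated inputs: the t-statistics tstats, standard errors se, and the matrix of simulated centred estimates Beta_i are hypothetical placeholders, not objects taken from the package.

    # sketch of the Romano-Wolf stepdown adjustment with simulated inputs
    set.seed(42)
    k <- 5                                   # number of hypotheses
    B <- 1000                                # number of simulation draws
    se <- rep(1, k)                          # (hypothetical) standard errors
    tstats <- c(4.0, 2.5, 1.8, 0.4, -0.2)    # (hypothetical) observed t-statistics
    Beta_i <- matrix(rnorm(B * k), nrow = B) # (hypothetical) draws of the centred estimates

    stepdown.index <- order(abs(tstats), decreasing = TRUE)  # test largest statistic first
    ro <- order(stepdown.index)                              # map back to original order
    pinit <- corr.padj <- numeric(k)
    for (s in 1:k) {
      if (s == 1) {
        # max statistic over all hypotheses
        sim <- apply(Beta_i, 1, function(z) max(abs(z) / se))
      } else {
        # drop hypotheses already handled in earlier steps
        keep <- -stepdown.index[1:(s - 1)]
        sim <- apply(Beta_i[, keep, drop = FALSE], 1, function(z) max(abs(z) / se[keep]))
      }
      pinit[s] <- min(1, sum(sim >= abs(tstats[stepdown.index][s])) / B)
    }
    # enforce monotonicity of the adjusted p-values
    for (j in 1:k) {
      corr.padj[j] <- if (j == 1) pinit[j] else max(pinit[j], corr.padj[j - 1])
    }
    pval <- corr.padj[ro]   # adjusted p-values in the original coefficient order

The monotonicity pass in the second loop is what makes this a stepdown procedure: an adjusted p-value can never fall below that of a hypothesis with a larger test statistic, which is exactly the role of the corr.padj loop in the code below.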
#' #' #' @examples -#' library(hdm); +#' library(hdm) #' set.seed(1) -#' n = 100 #sample size -#' p = 25 # number of variables -#' s = 3 # nubmer of non-zero variables -#' X = matrix(rnorm(n*p), ncol=p) -#' colnames(X) <- paste("X", 1:p, sep="") -#' beta = c(rep(3,s), rep(0,p-s)) -#' y = 1 + X%*%beta + rnorm(n) -#' data = data.frame(cbind(y,X)) +#' n <- 100 # sample size +#' p <- 25 # number of variables +#' s <- 3 # nubmer of non-zero variables +#' X <- matrix(rnorm(n * p), ncol = p) +#' colnames(X) <- paste("X", 1:p, sep = "") +#' beta <- c(rep(3, s), rep(0, p - s)) +#' y <- 1 + X %*% beta + rnorm(n) +#' data <- data.frame(cbind(y, X)) #' colnames(data)[1] <- "y" -#' lasso.effect = rlassoEffects(X, y, index=c(1:20)) -#' pvals.lasso.effect = p_adjust(lasso.effect, method = "RW", B = 1000) -#' ols = lm(y ~ -1 + X, data) -#' pvals.ols = p_adjust(ols, method = "RW", B = 1000) -#' pvals.ols = p_adjust(ols, method = "RW", B = 1000, test.index = c(1,2,5)) -#' pvals.ols = p_adjust(ols, method = "RW", B = 1000, test.index = c(rep(TRUE, 5), rep(FALSE, p-5))) +#' lasso.effect <- rlassoEffects(X, y, index = c(1:20)) +#' pvals.lasso.effect <- p_adjust(lasso.effect, method = "RW", B = 1000) +#' ols <- lm(y ~ -1 + X, data) +#' pvals.ols <- p_adjust(ols, method = "RW", B = 1000) +#' pvals.ols <- p_adjust(ols, method = "RW", B = 1000, test.index = c(1, 2, 5)) +#' pvals.ols <- p_adjust(ols, method = "RW", B = 1000, test.index = c(rep(TRUE, 5), rep(FALSE, p - 5))) #' @export -p_adjust = function(x, ...){ +p_adjust <- function(x, ...) { UseMethod("p_adjust") } @@ -89,41 +89,41 @@ p_adjust.rlassoEffects <- function(x, method = "RW", B = 1000, ...) { Omegahat <- matrix(NA, ncol = k, nrow = k) for (j in 1:k) { for (l in 1:k) { - Omegahat[j, l] = Omegahat[l, j] = 1/(Ev2[j] * Ev2[l]) * mean(ev[, j] * ev[, l]) + Omegahat[j, l] <- Omegahat[l, j] <- 1 / (Ev2[j] * Ev2[l]) * mean(ev[, j] * ev[, l]) } } se <- sqrt(diag(Omegahat)) Beta_i <- matrix(NA, ncol = k, nrow = B) for (i in 1:B) { - Beta_i[i, ] <- MASS::mvrnorm(mu = rep(0, k), Sigma = Omegahat/n) + Beta_i[i, ] <- MASS::mvrnorm(mu = rep(0, k), Sigma = Omegahat / n) } - tstats <- cf/se + tstats <- cf / se stepdown.index <- order(abs(tstats), decreasing = TRUE) ro <- order(stepdown.index) for (s in 1:k) { if (s == 1) { - sim <- apply(Beta_i, 1, function(z) max(abs(z)/se)) - pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s])))/B) + sim <- apply(Beta_i, 1, function(z) max(abs(z) / se)) + pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s]))) / B) } if (s > 1) { - sim <- apply(Beta_i[, -stepdown.index[1:(s - 1)], drop = F], 1, function(z) max(abs(z)/se[-stepdown.index[1:(s - 1)]])) - pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s])))/B) + sim <- apply(Beta_i[, -stepdown.index[1:(s - 1)], drop = F], 1, function(z) max(abs(z) / se[-stepdown.index[1:(s - 1)]])) + pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s]))) / B) } } - for (j in 1:k) { - if (j == 1) { - corr.padj[j] <- pinit[j] - } + for (j in 1:k) { + if (j == 1) { + corr.padj[j] <- pinit[j] + } - if (j > 1) { - corr.padj[j] <- max(pinit[j], corr.padj[j - 1]) - } + if (j > 1) { + corr.padj[j] <- max(pinit[j], corr.padj[j - 1]) } - pval <- corr.padj[ro] + } + pval <- corr.padj[ro] } res <- as.matrix(cbind(cf, pval)) @@ -135,13 +135,13 @@ p_adjust.rlassoEffects <- function(x, method = "RW", B = 1000, ...) { #' @describeIn p_adjust \code{\link[stats]{lm}}. #' @export -p_adjust.lm = function(x, method = "RW", B = 1000, test.index = NULL, ...) 
{ +p_adjust.lm <- function(x, method = "RW", B = 1000, test.index = NULL, ...) { checkmate::checkClass(x, "lm") checkmate::checkChoice(method, c("RW", stats::p.adjust.methods)) checkmate::assert(checkmate::checkNull(test.index), checkmate::checkLogical(test.index), checkmate::checkNumeric(test.index), checkmate::checkCharacter(test.index)) - cf = coef(x) - pnames = names(cf) + cf <- coef(x) + pnames <- names(cf) if (is.null(test.index)) { k <- length(coef(x)) @@ -152,7 +152,6 @@ p_adjust.lm = function(x, method = "RW", B = 1000, test.index = NULL, ...) { stopifnot(length(test.index) == length(coef(x))) index <- which(test.index == T) } else { - if (is.numeric(test.index)) { index <- as.integer(test.index) stopifnot(all(test.index <= length(coef(x))) && length(test.index) <= length(coef(x))) @@ -187,12 +186,12 @@ p_adjust.lm = function(x, method = "RW", B = 1000, test.index = NULL, ...) { for (s in 1:k) { if (s == 1) { - sim <- apply(Beta_i, 1, function(z) max(abs(z)/se)) - pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s])))/B) + sim <- apply(Beta_i, 1, function(z) max(abs(z) / se)) + pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s]))) / B) } - if (s > 1 ) { - sim <- apply(Beta_i[, -stepdown.index[1:(s - 1)], drop = FALSE], 1, function(z) max(abs(z)/se[-stepdown.index[1:(s - 1)]])) - pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s])))/B) + if (s > 1) { + sim <- apply(Beta_i[, -stepdown.index[1:(s - 1)], drop = FALSE], 1, function(z) max(abs(z) / se[-stepdown.index[1:(s - 1)]])) + pinit[s] <- pmin(1, (sum(sim >= abs(tstats[stepdown.index][s]))) / B) } } @@ -214,4 +213,3 @@ p_adjust.lm = function(x, method = "RW", B = 1000, test.index = NULL, ...) { return(res) } - diff --git a/R/pkg-package.R b/R/pkg-package.R index 2d998dd..fa8e1e9 100644 --- a/R/pkg-package.R +++ b/R/pkg-package.R @@ -3,10 +3,10 @@ #' This package implements methods for estimation and inference in a high-dimensional setting. #' #' \tabular{ll}{ Package: \tab hdm\cr Type: \tab Package\cr Version: \tab -#' 0.1\cr Date: \tab 2015-05-25\cr License: \tab GPL-3\cr } This package provides efficient estimators -#' and uniformly valid confidence intervals for various low-dimensional causal/structural parameters +#' 0.1\cr Date: \tab 2015-05-25\cr License: \tab GPL-3\cr } This package provides efficient estimators +#' and uniformly valid confidence intervals for various low-dimensional causal/structural parameters #' appearing in high-dimensional approximately sparse models. The package -#' includes functions for fitting heteroskedastic robust Lasso regressions with non-Gaussian erros and +#' includes functions for fitting heteroskedastic robust Lasso regressions with non-Gaussian erros and #' for instrumental variable (IV) and treatment effect estimation in a #' high-dimensional setting. Moreover, the methods enable valid post-selection #' inference. Moreover, a theoretically grounded, data-driven choice of the penalty level is provided. @@ -29,7 +29,7 @@ #' Review of Economic Studies 81(2), 608-650. #' @importFrom stats binomial #' @importFrom stats coef -#' @importFrom stats confint +#' @importFrom stats confint #' @importFrom stats cor #' @importFrom stats glm #' @importFrom stats lm @@ -141,11 +141,11 @@ NULL #' #' Dataset on settler mortality. #' -#' Data set was analysed in Acemoglu et al. (2001). A detailed description of the data can be found at \url{http://economics.mit.edu/faculty/acemoglu/data/ajr2001} +#' Data set was analysed in Acemoglu et al. (2001). 
A detailed description of the data can be found at \url{http://economics.mit.edu/faculty/acemoglu/data/ajr2001} #' #' @name AJR #' @docType data -#' @format \describe{ +#' @format \describe{ #' \item{Mort}{Settler mortality} #' \item{logMort}{logarithm of Mort} #' \item{Latitude}{Latitude} @@ -156,7 +156,7 @@ NULL #' \item{Samer}{South America} #' \item{Neo}{Neo-Europes} #' \item{GDP}{GDP} -#' \item{Exprop}{Average protection against expropriation risk} +#' \item{Exprop}{Average protection against expropriation risk} #' } #' @references D. Acemoglu, S. Johnson, J. A. Robinson (2001). Colonial origins of comparative development: an empirical investigation. #' American Economic Review, 91, 1369--1401. @@ -169,15 +169,15 @@ NULL #' #' Dataset on judicial eminent domain decisions. #' -#' Data set was analyzed in Belloni et al. (2012). They estimate the effect of judicial eminent domain decisions on economic outcomes with instrumental variables (IV) in a setting high a large set of potential IVs. -#' A detailed decription of the data can be found at -#' \url{https://www.econometricsociety.org/publications/econometrica/2012/11/01/sparse-models-and-methods-optimal-instruments-application} -#' The data set contains four "sub-data sets" which differ mainly in the dependent variables: repeat-sales FHFA/OFHEO house price index for metro (FHFA) and non-metro (NM) area, the Case-Shiller home price index (CS), +#' Data set was analyzed in Belloni et al. (2012). They estimate the effect of judicial eminent domain decisions on economic outcomes with instrumental variables (IV) in a setting high a large set of potential IVs. +#' A detailed decription of the data can be found at +#' \url{https://www.econometricsociety.org/publications/econometrica/2012/11/01/sparse-models-and-methods-optimal-instruments-application} +#' The data set contains four "sub-data sets" which differ mainly in the dependent variables: repeat-sales FHFA/OFHEO house price index for metro (FHFA) and non-metro (NM) area, the Case-Shiller home price index (CS), #' and state-level GDP from the Bureau of Economic Analysis - all transformed with the logarithm. The structure of each subdata set is given above. #' In the data set the following variables and name conventions are used: -#' "numpanelskx_..." is the number of panels with at least k members with the characteristic following the "_". -#' The probability controls (names start with "F_prob_") follow a similar naming convention and give the probability of observing a panel with characteristic given following second "_" given the characteristics of the pool of judges available to be assigned to the case. -#' +#' "numpanelskx_..." is the number of panels with at least k members with the characteristic following the "_". +#' The probability controls (names start with "F_prob_") follow a similar naming convention and give the probability of observing a panel with characteristic given following second "_" given the characteristics of the pool of judges available to be assigned to the case. 
+#' #' Characteristics in the data for the control variables or instruments: #' \describe{ #' \item{noreligion}{judge reports no religious affiliation} @@ -198,10 +198,10 @@ NULL #' \item{circuit}{dummy for the circuit level (reference category excluded)} #' \item{missing_cy_12}{a dummy for whether there were no cases in that circuit-year} #' \item{numcasecat_12}{the number of takings appellate decisions} -#'} +#' } #' @name EminentDomain #' @docType data -#' @format \describe{ +#' @format \describe{ #' \item{y}{economic outcome variable} #' \item{x}{set of exogenous variables} #' \item{d}{eminent domain decisions} @@ -219,13 +219,13 @@ NULL #' #' Automobile data set from the US. #' -#' Data set was analysed in Berry, Levinsohn and Pakes (1995). The data stem from annual issues of the Automotive News Market Data Book. +#' Data set was analysed in Berry, Levinsohn and Pakes (1995). The data stem from annual issues of the Automotive News Market Data Book. #' The data set inlcudes information on all models marketed during the the period beginning 1971 and ending in 1990 cotaining 2217 model/years from 997 distinct models. #' A detailed description is given in BLP (1995, 868--871). The internal function \code{constructIV} constructs instrumental variables along the lines described and used in BLP (1995). #' #' @name BLP #' @docType data -#' @format \describe{ +#' @format \describe{ #' \item{model.name}{model name} #' \item{model.id}{model id} #' \item{firm.id}{firm id} @@ -252,13 +252,13 @@ NULL #' #' Census data from the US for the year 2012. #' -#' The CPS is a monthly U.S. household survey conducted jointly by the U.S. Census Bureau and the Bureau of Labor Statistics. The data comprise the year 2012. -#' This data set was used in Mulligan and Rubinstein (2008). -#' The sample comprises white non-hipanic, ages 25-54, working full time full year (35+ hours per week at least 50 weeks), exclude living in group quarters, +#' The CPS is a monthly U.S. household survey conducted jointly by the U.S. Census Bureau and the Bureau of Labor Statistics. The data comprise the year 2012. +#' This data set was used in Mulligan and Rubinstein (2008). +#' The sample comprises white non-hipanic, ages 25-54, working full time full year (35+ hours per week at least 50 weeks), exclude living in group quarters, #' self-employed, military, agricultural, and private household sector, allocated earning, inconsistent report on earnings and employment, missing data. 
#' @name cps2012 #' @docType data -#' @format \describe{ +#' @format \describe{ #' \item{lnw}{log of hourly wage (annual earnings / annual hours)} #' \item{female}{female indicator} #' \item{married status}{ six indicators: widowed, divorced, separated, nevermarried, and married (omitted)} @@ -272,4 +272,4 @@ NULL #' @keywords datasets #' @examples #' data(BLP) -NULL \ No newline at end of file +NULL diff --git a/R/rlasso.R b/R/rlasso.R index 8e18fcd..7d94423 100644 --- a/R/rlasso.R +++ b/R/rlasso.R @@ -1,4 +1,4 @@ -globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "select.Z" , "select.X", "aes", "element_blank", "scale_x_discrete", "model.part", "all.categories", "X")) +globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "select.Z", "select.X", "aes", "element_blank", "scale_x_discrete", "model.part", "all.categories", "X")) #' rlasso: Function for Lasso estimation under homoscedastic and heteroscedastic non-Gaussian #' disturbances @@ -11,7 +11,7 @@ globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "sele #' The function estimates the coefficients of a Lasso regression with #' data-driven penalty under homoscedasticity / heteroscedasticity and non-Gaussian noise. The options \code{homoscedastic} is a logical with \code{FALSE} by default. #' Moreover, for the calculation of the penalty parameter it can be chosen, if the penalization parameter depends on the design matrix (\code{X.dependent.lambda=TRUE}) or \code{independent} (default, \code{X.dependent.lambda=FALSE}). -#' The default value of the constant \code{c} is \code{1.1} in the post-Lasso case and \code{0.5} in the Lasso case. +#' The default value of the constant \code{c} is \code{1.1} in the post-Lasso case and \code{0.5} in the Lasso case. #' A \emph{special} option is to set \code{homoscedastic} to \code{none} and to supply a values \code{lambda.start}. Then this value is used as penalty parameter with independent design and heteroscedastic errors to weight the regressors. #' For details of the #' implementation of the Algorithm for estimation of the data-driven penalty, @@ -19,7 +19,7 @@ globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "sele #' Belloni et al. (2012). When the option "none" is chosen for \code{homoscedastic} (together with #' \code{lambda.start}), lambda is set to \code{lambda.start} and the #' regressor-independent loadings und heteroscedasticity are used. The options "X-dependent" and -#' "X-independent" under homoscedasticity are described in Belloni et al. (2013). +#' "X-independent" under homoscedasticity are described in Belloni et al. (2013). # \code{lambda.start} can be component-specific. When used with one of the # other option, the values are used as starting values. #' @@ -31,7 +31,7 @@ globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "sele #' @param intercept logical. If \code{TRUE}, intercept is included which is not #' penalized. #' @param model logical. If \code{TRUE} (default), model matrix is returned. -#' @param penalty list with options for the calculation of the penalty. +#' @param penalty list with options for the calculation of the penalty. #' \itemize{ #' \item{\code{c} and \code{gamma}}{ constants for the penalty with default \code{c=1.1} and \code{gamma=0.1}} #' \item{\code{homoscedastic}}{ logical, if homoscedastic errors are considered (default \code{FALSE}). 
Option \code{none} is described below.} @@ -43,7 +43,7 @@ globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "sele #' \code{numIter} number of iterations for the algorithm for #' the estimation of the variance and data-driven penalty, ie. loadings, #' \code{tol} tolerance for improvement of the estimated variances. -#'\code{threshold} is applied to the final estimated lasso +#' \code{threshold} is applied to the final estimated lasso #' coefficients. Absolute values below the threshold are set to zero. #' @param ... further arguments (only for consistent defintion of methods) #' @return \code{rlasso} returns an object of class \code{rlasso}. An object of @@ -66,25 +66,25 @@ globalVariables(c("post", "intercept", "penalty", "control", "error", "n", "sele #' high-dimensional sparse econometric models. In Advances in Economics and #' Econometrics: 10th World Congress, Vol. 3: Econometrics, Cambirdge #' University Press: Cambridge, 245-295. -#' @examples +#' @examples #' set.seed(1) -#' n = 100 #sample size -#' p = 100 # number of variables -#' s = 3 # nubmer of variables with non-zero coefficients -#' X = Xnames = matrix(rnorm(n*p), ncol=p) -#' colnames(Xnames) <- paste("V", 1:p, sep="") -#' beta = c(rep(5,s), rep(0,p-s)) -#' Y = X%*%beta + rnorm(n) -#' reg.lasso <- rlasso(Y~Xnames) -#' Xnew = matrix(rnorm(n*p), ncol=p) # new X -#' colnames(Xnew) <- paste("V", 1:p, sep="") -#' Ynew = Xnew%*%beta + rnorm(n) #new Y -#' yhat = predict(reg.lasso, newdata = Xnew) +#' n <- 100 # sample size +#' p <- 100 # number of variables +#' s <- 3 # nubmer of variables with non-zero coefficients +#' X <- Xnames <- matrix(rnorm(n * p), ncol = p) +#' colnames(Xnames) <- paste("V", 1:p, sep = "") +#' beta <- c(rep(5, s), rep(0, p - s)) +#' Y <- X %*% beta + rnorm(n) +#' reg.lasso <- rlasso(Y ~ Xnames) +#' Xnew <- matrix(rnorm(n * p), ncol = p) # new X +#' colnames(Xnew) <- paste("V", 1:p, sep = "") +#' Ynew <- Xnew %*% beta + rnorm(n) # new Y +#' yhat <- predict(reg.lasso, newdata = Xnew) #' @export #' @rdname rlasso rlasso <- function(x, ...) { UseMethod("rlasso") # definition generic function - } +} #' @param formula an object of class "formula" (or one that can be coerced to #' that class): a symbolic description of the model to be fitted in the form #' \code{y~x} @@ -94,11 +94,11 @@ rlasso <- function(x, ...) { #' typically the environment from which \code{rlasso} is called. #' @rdname rlasso #' @export -rlasso.formula <- function(formula, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, - penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1/log(n)), - control = list(numIter = 15, tol = 10^-5, threshold = NULL), ...) { +rlasso.formula <- function(formula, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, + penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1 / log(n)), + control = list(numIter = 15, tol = 10^-5, threshold = NULL), ...) 
{ cl <- match.call() - #if (missing(data)) data <- environment(formula) + # if (missing(data)) data <- environment(formula) mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data"), names(mf), 0L) mf <- mf[c(1L, m)] @@ -109,40 +109,47 @@ rlasso.formula <- function(formula, data = NULL, post = TRUE, intercept = TRUE, attr(mt, "intercept") <- 1 y <- model.response(mf, "numeric") n <- length(y) - x <- model.matrix(mt, mf)[,-1, drop=FALSE] + x <- model.matrix(mt, mf)[, -1, drop = FALSE] if (missing(data)) { - if (is.call(formula[[3]])) { - #colnames(x) <- sub(format(formula[[3]]), "", colnames(x)) - colnames(x) <- sub(re.escape(format(formula[[3]])), "", colnames(x)) + if (is.call(formula[[3]])) { + # colnames(x) <- sub(format(formula[[3]]), "", colnames(x)) + colnames(x) <- sub(re.escape(format(formula[[3]])), "", colnames(x)) } else { - colnames(x) <- sub(re.escape(formula[[3]]), "", colnames(x)) + colnames(x) <- sub(re.escape(formula[[3]]), "", colnames(x)) } } - est <- rlasso(x, y, post = post, intercept = intercept, penalty=penalty, model=model, - control = control) + est <- rlasso(x, y, + post = post, intercept = intercept, penalty = penalty, model = model, + control = control + ) est$call <- cl return(est) } #' @rdname rlasso #' @export -rlasso.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, - penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1/log(n)), +rlasso.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, + penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1 / log(n)), control = list(numIter = 15, tol = 10^-5, threshold = NULL), ...) { formula <- as.formula(x) - if (missing(penalty)) - rlasso.formula(formula, data = data, post = post, intercept = intercept, model = model, - control = control, ...) - else - rlasso.formula(x, x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, - penalty = penalty, - control = list(numIter = 15, tol = 10^-5, threshold = NULL), ...) + if (missing(penalty)) { + rlasso.formula(formula, + data = data, post = post, intercept = intercept, model = model, + control = control, ... + ) + } else { + rlasso.formula(x, x, + data = NULL, post = TRUE, intercept = TRUE, model = TRUE, + penalty = penalty, + control = list(numIter = 15, tol = 10^-5, threshold = NULL), ... + ) + } } -# rlasso.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, +# rlasso.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, # penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1/log(n)), # control = list(numIter = 15, tol = 10^-5, threshold = NULL), ...) { # formula <- as.formula(x) -# res <- rlasso.formula(formula, data = data, post = post, intercept = intercept, model = model, +# res <- rlasso.formula(formula, data = data, post = post, intercept = intercept, model = model, # penalty = penalty, control = control, ...) 
# } @@ -151,43 +158,44 @@ rlasso.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, mode #' @export #' @param y dependent variable (vector, matrix or object can be coerced to matrix) #' @param x regressors (vector, matrix or object can be coerced to matrix) -rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, - penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1/log(n)), - control = list(numIter = 15, tol = 10^-5, threshold = NULL),...) { +rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, + penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1 / log(n)), + control = list(numIter = 15, tol = 10^-5, threshold = NULL), ...) { x <- as.matrix(x) y <- as.matrix(y) - + n <- dim(x)[1] p <- dim(x)[2] - - if (is.null(colnames(x))) + + if (is.null(colnames(x))) { colnames(x) <- paste("V", 1:p, sep = "") + } ind.names <- 1:p # set options to default values if missing - if (!exists("homoscedastic", where = penalty)) penalty$homoscedastic = "FALSE" - if (!exists("X.dependent.lambda", where = penalty)) penalty$X.dependent.lambda = "FALSE" - if (!exists("gamma", where = penalty)) penalty$gamma = 0.1/log(n) - - if (penalty$homoscedastic=="none" & !exists("lambda.start", where=penalty)) stop("lambda.start must be provided!") + if (!exists("homoscedastic", where = penalty)) penalty$homoscedastic <- "FALSE" + if (!exists("X.dependent.lambda", where = penalty)) penalty$X.dependent.lambda <- "FALSE" + if (!exists("gamma", where = penalty)) penalty$gamma <- 0.1 / log(n) + + if (penalty$homoscedastic == "none" & !exists("lambda.start", where = penalty)) stop("lambda.start must be provided!") # checking input numIter, tol if (!exists("numIter", where = control)) { - control$numIter = 15 + control$numIter <- 15 } - + if (!exists("tol", where = control)) { - control$tol = 10^-5 + control$tol <- 10^-5 } - - #if (post==FALSE & (!exists("c", where = penalty) | is.na(match("penalty", names(as.list(match.call)))))) { - if (post==FALSE & (!exists("c", where = penalty))) { - penalty$c = 0.5 + + # if (post==FALSE & (!exists("c", where = penalty) | is.na(match("penalty", names(as.list(match.call)))))) { + if (post == FALSE & (!exists("c", where = penalty))) { + penalty$c <- 0.5 } - - default_pen <- list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1/log(n)) - if (post==FALSE & isTRUE(all.equal(penalty, default_pen))) { - penalty$c = 0.5 + + default_pen <- list(homoscedastic = FALSE, X.dependent.lambda = FALSE, lambda.start = NULL, c = 1.1, gamma = .1 / log(n)) + if (post == FALSE & isTRUE(all.equal(penalty, default_pen))) { + penalty$c <- 0.5 } - + # Intercept handling and scaling if (intercept) { meanx <- colMeans(x) @@ -198,15 +206,15 @@ rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, meanx <- rep(0, p) mu <- 0 } - + normx <- sqrt(apply(x, 2, var)) - Psi <- apply(x, 2, function(x) mean(x^2)) + Psi <- apply(x, 2, function(x) mean(x^2)) ind <- rep(FALSE, p) # - + # variables with low variation are taken out, because normalization is not reliable # eps <- 10^-9 # precision for scaling - #ind <- which(normx < eps) - #if (length(ind) != 0) { + # ind <- which(normx < eps) + # if (length(ind) != 0) { # x <- x[, -ind] # normx <- normx[-ind] # ind.names <- ind.names[-ind] @@ -214,59 +222,63 @@ rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, # if 
(!is.null(penalty$lambda.start)) { # penalty$lambda.start <- penalty$lambda.start[-ind] # } - #} - + # } + # - + XX <- crossprod(x) Xy <- crossprod(x, y) - - startingval <- init_values(x,y)$residuals + + startingval <- init_values(x, y)$residuals pen <- lambdaCalculation(penalty = penalty, y = startingval, x = x) lambda <- pen$lambda Ups0 <- Ups1 <- pen$Ups0 lambda0 <- pen$lambda0 - + mm <- 1 s0 <- sqrt(var(y)) - + while (mm <= control$numIter) { # calculation parameters - #coefTemp <- LassoShooting.fit(x, y, lambda, XX = XX, Xy = Xy)$coefficients - #xn <- t(t(x)/as.vector(Ups1)) - if (mm==1 && post) { - coefTemp <- LassoShooting.fit(x, y, lambda/2, XX = XX, Xy = Xy)$coefficients - #lasso.reg <- glmnet::glmnet(xn, y, family = c("gaussian"), alpha = 1, + # coefTemp <- LassoShooting.fit(x, y, lambda, XX = XX, Xy = Xy)$coefficients + # xn <- t(t(x)/as.vector(Ups1)) + if (mm == 1 && post) { + coefTemp <- LassoShooting.fit(x, y, lambda / 2, XX = XX, Xy = Xy)$coefficients + # lasso.reg <- glmnet::glmnet(xn, y, family = c("gaussian"), alpha = 1, # lambda = lambda0/(2*n)/2, standardize = FALSE, intercept = FALSE) - #lasso.reg <- glmnet::glmnet(x, y, family = c("gaussian"), alpha = 1, + # lasso.reg <- glmnet::glmnet(x, y, family = c("gaussian"), alpha = 1, # lambda = lambda0/(2*n)/2, standardize = FALSE, intercept = FALSE, penalty.factor = Ups1) } else { coefTemp <- LassoShooting.fit(x, y, lambda, XX = XX, Xy = Xy)$coefficients - #lasso.reg <- glmnet::glmnet(xn, y, family = c("gaussian"), alpha = 1, + # lasso.reg <- glmnet::glmnet(xn, y, family = c("gaussian"), alpha = 1, # lambda = lambda0/(2*n), standardize = FALSE, intercept = FALSE) - #lasso.reg <- glmnet::glmnet(x, y, family = c("gaussian"), alpha = 1, + # lasso.reg <- glmnet::glmnet(x, y, family = c("gaussian"), alpha = 1, # lambda = lambda0/(2*n), standardize = FALSE, intercept = FALSE, penalty.factor = Ups1) } - #coefTemp <- as.vector(lasso.reg$beta) - #names(coefTemp) <- colnames(x) + # coefTemp <- as.vector(lasso.reg$beta) + # names(coefTemp) <- colnames(x) coefTemp[is.na(coefTemp)] <- 0 ind1 <- (abs(coefTemp) > 0) x1 <- as.matrix(x[, ind1, drop = FALSE]) if (dim(x1)[2] == 0) { if (intercept) { intercept.value <- mean(y + mu) - coef <- rep(0,p+1) - names(coef) <- c("intercept", colnames(x)) #c("intercept", names(coefTemp)) + coef <- rep(0, p + 1) + names(coef) <- c("intercept", colnames(x)) # c("intercept", names(coefTemp)) } else { intercept.value <- mean(y) - coef <- rep(0,p) - names(coef) <- colnames(x) #names(coefTemp) + coef <- rep(0, p) + names(coef) <- colnames(x) # names(coefTemp) } - est <- list(coefficients = coef, beta=rep(0,p), intercept=intercept.value, index = rep(FALSE, p), - lambda = lambda, lambda0 = lambda0, loadings = Ups0, residuals = y - - mean(y), sigma = var(y), iter = mm, call = match.call(), - options = list(post = post, intercept = intercept, ind.scale=ind, - control = control, mu = mu, meanx = meanx)) + est <- list( + coefficients = coef, beta = rep(0, p), intercept = intercept.value, index = rep(FALSE, p), + lambda = lambda, lambda0 = lambda0, loadings = Ups0, residuals = y - + mean(y), sigma = var(y), iter = mm, call = match.call(), + options = list( + post = post, intercept = intercept, ind.scale = ind, + control = control, mu = mu, meanx = meanx + ) + ) if (model) { est$model <- x } else { @@ -277,7 +289,7 @@ rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, class(est) <- "rlasso" return(est) } - + # refinement variance estimation if (post) { reg <- lm(y ~ -1 + x1) @@ -290,50 
+302,50 @@ rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, e1 <- y - x1 %*% coefTemp[ind1] } s1 <- sqrt(var(e1)) - + # homoscedatic and X-independent if (penalty$homoscedastic == TRUE && penalty$X.dependent.lambda == FALSE) { - Ups1 <- c(s1)*Psi - #lambda <- rep(pen$lambda0 * s1, p) - lambda <- pen$lambda0*Ups1 + Ups1 <- c(s1) * Psi + # lambda <- rep(pen$lambda0 * s1, p) + lambda <- pen$lambda0 * Ups1 } # homoscedatic and X-dependent if (penalty$homoscedastic == TRUE && penalty$X.dependent.lambda == TRUE) { - Ups1 <- c(s1)*Psi - #lambda <- rep(pen$lambda0 * s1, p) + Ups1 <- c(s1) * Psi + # lambda <- rep(pen$lambda0 * s1, p) lambda <- pen$lambda0 * Ups1 } # heteroscedastic and X-independent if (penalty$homoscedastic == FALSE && penalty$X.dependent.lambda == FALSE) { - Ups1 <- 1/sqrt(n) * sqrt(t(t(e1^2) %*% (x^2))) + Ups1 <- 1 / sqrt(n) * sqrt(t(t(e1^2) %*% (x^2))) lambda <- pen$lambda0 * Ups1 } - + # heteroscedastic and X-dependent if (penalty$homoscedastic == FALSE && penalty$X.dependent.lambda == TRUE) { - lc <- lambdaCalculation(penalty, y=e1, x=x) + lc <- lambdaCalculation(penalty, y = e1, x = x) Ups1 <- lc$Ups0 lambda <- lc$lambda } - - - + + + # none if (penalty$homoscedastic == "none") { if (is.null(penalty$lambda.start)) stop("Argument lambda.start required!") - Ups1 <- 1/sqrt(n) * sqrt(t(t(e1^2) %*% (x^2))) + Ups1 <- 1 / sqrt(n) * sqrt(t(t(e1^2) %*% (x^2))) lambda <- pen$lambda0 * Ups1 } - + mm <- mm + 1 if (abs(s0 - s1) < control$tol) { break } s0 <- s1 } - + if (dim(x1)[2] == 0) { - coefTemp = NULL + coefTemp <- NULL ind1 <- rep(0, p) } coefTemp <- as.vector(coefTemp) @@ -342,35 +354,39 @@ rlasso.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, coefTemp <- as.vector(as.vector(coefTemp)) names(coefTemp) <- names(ind1) <- colnames(x) if (intercept) { - if (is.null(mu)) mu <-0 - if (is.null(meanx)) meanx <- rep(0, length(coefTemp)) #<- 0 - if (sum(ind)==0) { - intercept.value <- mu - sum(meanx*coefTemp) + if (is.null(mu)) mu <- 0 + if (is.null(meanx)) meanx <- rep(0, length(coefTemp)) # <- 0 + if (sum(ind) == 0) { + intercept.value <- mu - sum(meanx * coefTemp) } else { - intercept.value <- mu - sum(meanx*coefTemp) #sum(meanx[-ind]*coefTemp) + intercept.value <- mu - sum(meanx * coefTemp) # sum(meanx[-ind]*coefTemp) } } else { intercept.value <- NA } - - #if (intercept) { - # e1 <- y - x1 %*% coefTemp[ind1] - intercept.value - #} else { + + # if (intercept) { + # e1 <- y - x1 %*% coefTemp[ind1] - intercept.value + # } else { # e1 <- y - x1 %*% coefTemp[ind1] - #} + # } if (intercept) { beta <- c(intercept.value, coefTemp) names(beta)[1] <- "(Intercept)" } else { beta <- coefTemp } - + s1 <- sqrt(var(e1)) - est <- list(coefficients = beta, beta=coefTemp, intercept=intercept.value, index = ind1, lambda = lambda, - lambda0 = lambda0, loadings = Ups1, residuals = as.vector(e1), sigma = s1, - iter = mm, call = match.call(), options = list(post = post, intercept = intercept, - control = control, penalty = penalty, ind.scale=ind, - mu = mu, meanx = meanx), model=model) + est <- list( + coefficients = beta, beta = coefTemp, intercept = intercept.value, index = ind1, lambda = lambda, + lambda0 = lambda0, loadings = Ups1, residuals = as.vector(e1), sigma = s1, + iter = mm, call = match.call(), options = list( + post = post, intercept = intercept, + control = control, penalty = penalty, ind.scale = ind, + mu = mu, meanx = meanx + ), model = model + ) if (model) { x <- scale(x, -meanx, FALSE) est$model <- x @@ -391,7 +407,7 @@ rlasso.default 
<- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, #' #' This function implements different methods for calculation of the penalization parameter \eqn{\lambda}. Further details can be found under \link{rlasso}. #' -#' @param penalty list with options for the calculation of the penalty. +#' @param penalty list with options for the calculation of the penalty. #' \itemize{ #' \item{\code{c} and \code{gamma}}{ constants for the penalty with default \code{c=1.1} and \code{gamma=0.1}} #' \item{\code{homoscedastic}}{ logical, if homoscedastic errors are considered (default \code{FALSE}). Option \code{none} is described below.} @@ -410,30 +426,30 @@ lambdaCalculation <- function(penalty = list(homoscedastic = FALSE, X.dependent. y = NULL, x = NULL) { checkmate::checkChoice(penalty$X.dependent.lambda, c(TRUE, FALSE, NULL)) checkmate::checkChoice(penalty$homoscedastic, c(TRUE, FALSE, "none")) - if (!exists("homoscedastic", where = penalty)) penalty$homoscedastic = "FALSE" - if (!exists("X.dependent.lambda", where = penalty)) penalty$X.dependent.lambda = "FALSE" - if (!exists("c", where = penalty) & penalty$homoscedastic!="none") { - penalty$c = 1.1 + if (!exists("homoscedastic", where = penalty)) penalty$homoscedastic <- "FALSE" + if (!exists("X.dependent.lambda", where = penalty)) penalty$X.dependent.lambda <- "FALSE" + if (!exists("c", where = penalty) & penalty$homoscedastic != "none") { + penalty$c <- 1.1 } - if (!exists("gamma", where = penalty) & penalty$homoscedastic!="none") { - penalty$gamma = 0.1 + if (!exists("gamma", where = penalty) & penalty$homoscedastic != "none") { + penalty$gamma <- 0.1 } # homoscedastic and X-independent - if (penalty$homoscedastic==TRUE && penalty$X.dependent.lambda == FALSE) { + if (penalty$homoscedastic == TRUE && penalty$X.dependent.lambda == FALSE) { p <- dim(x)[2] n <- dim(x)[1] - lambda0 <- 2 * penalty$c * sqrt(n) * qnorm(1 - penalty$gamma/(2 * - p)) + lambda0 <- 2 * penalty$c * sqrt(n) * qnorm(1 - penalty$gamma / (2 * + p)) Ups0 <- sqrt(var(y)) lambda <- rep(lambda0 * Ups0, p) } # homoscedastic and X-dependent - if (penalty$homoscedastic==TRUE && penalty$X.dependent.lambda == TRUE) { + if (penalty$homoscedastic == TRUE && penalty$X.dependent.lambda == TRUE) { if (!exists("numSim", where = penalty)) { - penalty$numSim = 5000 + penalty$numSim <- 5000 } p <- dim(x)[2] n <- dim(x)[1] @@ -445,42 +461,42 @@ lambdaCalculation <- function(penalty = list(homoscedastic = FALSE, X.dependent. 
# psi <- apply(x, 2, function(x) mean(x^2)) # sim[l] <- n * max(2 * abs(colMeans(t(t(x)/sqrt(psi)) * g))) # } - + psi <- apply(x, 2, function(x) mean(x^2)) - tXtpsi <- t(t(x)/sqrt(psi)) - - for (l in 1:R) { - g <- matrix(rep(rnorm(n), each = p), ncol = p, byrow = TRUE) - sim[l] <- n * max(2 * abs(colMeans(tXtpsi * g))) - } - + tXtpsi <- t(t(x) / sqrt(psi)) + + for (l in 1:R) { + g <- matrix(rep(rnorm(n), each = p), ncol = p, byrow = TRUE) + sim[l] <- n * max(2 * abs(colMeans(tXtpsi * g))) + } + lambda0 <- penalty$c * quantile(sim, probs = 1 - penalty$gamma) Ups0 <- sqrt(var(y)) lambda <- rep(lambda0 * Ups0, p) } # heteroscedastic and X-independent (was "standard") - if (penalty$homoscedastic==FALSE && penalty$X.dependent.lambda == FALSE) { + if (penalty$homoscedastic == FALSE && penalty$X.dependent.lambda == FALSE) { p <- dim(x)[2] n <- dim(x)[1] - #lambda0 <- 2*penalty$c*sqrt(n)*sqrt(2*log(2*p*log(n)/penalty$gamma)) - lambda0 <- 2 * penalty$c * sqrt(n) * qnorm(1 - penalty$gamma/(2 * - p * 1)) # 1=num endogenous variables - Ups0 <- 1/sqrt(n) * sqrt(t(t(y^2) %*% (x^2))) + # lambda0 <- 2*penalty$c*sqrt(n)*sqrt(2*log(2*p*log(n)/penalty$gamma)) + lambda0 <- 2 * penalty$c * sqrt(n) * qnorm(1 - penalty$gamma / (2 * + p * 1)) # 1=num endogenous variables + Ups0 <- 1 / sqrt(n) * sqrt(t(t(y^2) %*% (x^2))) lambda <- lambda0 * Ups0 } - + # heteroscedastic and X-dependent - if (penalty$homoscedastic==FALSE && penalty$X.dependent.lambda == TRUE) { + if (penalty$homoscedastic == FALSE && penalty$X.dependent.lambda == TRUE) { if (!exists("numSim", where = penalty)) { - penalty$numSim = 5000 + penalty$numSim <- 5000 } p <- dim(x)[2] n <- dim(x)[1] R <- penalty$numSim sim <- vector("numeric", length = R) - #lasso.x.y <- rlasso(y ~ x) - #eh <- lasso.x.y$residuals + # lasso.x.y <- rlasso(y ~ x) + # eh <- lasso.x.y$residuals eh <- y ehat <- matrix(rep(eh, each = p), ncol = p, byrow = TRUE) # might be improved by initial estimator or passed through # for (l in 1:R) { @@ -490,23 +506,23 @@ lambdaCalculation <- function(penalty = list(homoscedastic = FALSE, X.dependent. # psi <- apply(xehat, 2, function(x) mean(x^2)) # sim[l] <- n * max(2 * abs(colMeans(t(t(xehat)/sqrt(psi)) * g))) # } - xehat <- x*ehat + xehat <- x * ehat psi <- apply(xehat, 2, function(x) mean(x^2)) - tXehattpsi <- t(t(xehat)/sqrt(psi)) - - for (l in 1:R) { - g <- matrix(rep(rnorm(n), each = p), ncol = p, byrow = TRUE) - #sim[l] <- n * max(2 * colMeans(x * ehat* g)) - sim[l] <- n * max(2 * abs(colMeans(tXehattpsi * g))) - } - - + tXehattpsi <- t(t(xehat) / sqrt(psi)) + + for (l in 1:R) { + g <- matrix(rep(rnorm(n), each = p), ncol = p, byrow = TRUE) + # sim[l] <- n * max(2 * colMeans(x * ehat* g)) + sim[l] <- n * max(2 * abs(colMeans(tXehattpsi * g))) + } + + lambda0 <- penalty$c * quantile(sim, probs = 1 - penalty$gamma) - Ups0 <- 1/sqrt(n) * sqrt(t(t(y^2) %*% (x^2))) + Ups0 <- 1 / sqrt(n) * sqrt(t(t(y^2) %*% (x^2))) lambda <- lambda0 * Ups0 } - - + + if (!is.null(penalty$lambda.start)) { p <- dim(x)[2] if (length(penalty$lambda.start) == 1) { @@ -516,11 +532,12 @@ lambdaCalculation <- function(penalty = list(homoscedastic = FALSE, X.dependent. 
} if (penalty$homoscedastic == "none") { - if (is.null(penalty$lambda.start) | !exists("lambda.start", where = penalty)) + if (is.null(penalty$lambda.start) | !exists("lambda.start", where = penalty)) { stop("For method \"none\" lambda.start must be provided") + } n <- dim(x)[1] lambda0 <- penalty$lambda.start - Ups0 <- 1/sqrt(n) * sqrt(t(t(y^2) %*% (x^2))) + Ups0 <- 1 / sqrt(n) * sqrt(t(t(y^2) %*% (x^2))) lambda <- lambda0 * Ups0 } @@ -551,24 +568,32 @@ lambdaCalculation <- function(penalty = list(homoscedastic = FALSE, X.dependent. #' @aliases methods.rlasso print.rlasso predict.rlasso model.matrix.rlasso #' @export -print.rlasso <- function(x, all=TRUE ,digits = max(3L, getOption("digits") - 3L), ...) { +print.rlasso <- function(x, all = TRUE, digits = max(3L, getOption("digits") - 3L), ...) { cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), "\n\n", sep = "") if (length(coef(x))) { if (all) { cat("Coefficients:\n") - print.default(format(coef(x), digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(coef(x), digits = digits), + print.gap = 2L, + quote = FALSE + ) } else { if (x$options$intercept) { - print.default(format(coef(x)[c(TRUE,x$index)], digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(coef(x)[c(TRUE, x$index)], digits = digits), + print.gap = 2L, + quote = FALSE + ) } else { - print.default(format(x$beta[x$index], digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(x$beta[x$index], digits = digits), + print.gap = 2L, + quote = FALSE + ) } } } - else cat("No coefficients\n") + else { + cat("No coefficients\n") + } cat("\n") invisible(x) } @@ -576,15 +601,15 @@ print.rlasso <- function(x, all=TRUE ,digits = max(3L, getOption("digits") - 3L) #' @rdname methods.rlasso #' @export -summary.rlasso <- function(object, all=TRUE, digits = max(3L, getOption("digits") - 3L), ...) { +summary.rlasso <- function(object, all = TRUE, digits = max(3L, getOption("digits") - 3L), ...) 
{ cat("\nCall:\n", paste(deparse(object$call), sep = "\n", collapse = "\n"), "\n", sep = "") - cat("\nPost-Lasso Estimation: ", paste(deparse(object$options$post), sep = "\n", collapse = "\n"), "\n", sep = " ") + cat("\nPost-Lasso Estimation: ", paste(deparse(object$options$post), sep = "\n", collapse = "\n"), "\n", sep = " ") coefs <- object$coefficients p <- length(object$beta) - num.selected <- sum(abs(object$beta)>0) + num.selected <- sum(abs(object$beta) > 0) n <- length(object$residuals) cat("\nTotal number of variables:", p) - cat("\nNumber of selected variables:", num.selected, "\n", sep=" ") + cat("\nNumber of selected variables:", num.selected, "\n", sep = " ") resid <- object$residuals cat("\nResiduals: \n") nam <- c("Min", "1Q", "Median", "3Q", "Max") @@ -593,46 +618,48 @@ summary.rlasso <- function(object, all=TRUE, digits = max(3L, getOption("digits" cat("\n") if (all) { coefm <- matrix(NA, length(coefs), 1) - coefm[,1] <- coefs + coefm[, 1] <- coefs colnames(coefm) <- "Estimate" rownames(coefm) <- names(coefs) printCoefmat(coefm, digits = digits, na.print = "NA") } else { - coefs <- coefs[abs(coefs)>0] + coefs <- coefs[abs(coefs) > 0] coefm <- matrix(NA, length(coefs), 1) - coefm[,1] <- coefs + coefm[, 1] <- coefs colnames(coefm) <- "Estimate" rownames(coefm) <- names(coefs) printCoefmat(coefm, digits = digits, na.print = "NA") } cat("\nResidual standard error:", format(signif(object$sigma, digits))) cat("\n") - + if (object$options$intercept) { df.int <- 1 } else { df.int <- 0 } - - object$r.squared <- 1 - object$rss/object$tss - object$adj.r.squared <- 1 - (1-object$r.squared)*((n-df.int)/(n-num.selected-df.int)) + + object$r.squared <- 1 - object$rss / object$tss + object$adj.r.squared <- 1 - (1 - object$r.squared) * ((n - df.int) / (n - num.selected - df.int)) cat("Multiple R-squared: ", formatC(object$r.squared, digits = digits)) cat("\nAdjusted R-squared: ", formatC(object$adj.r.squared, digits = digits)) - + if (!is.null(object$model)) { - object$supscore <- sqrt(n)*max(abs(colMeans(object$model*as.vector(object$dev)))) + object$supscore <- sqrt(n) * max(abs(colMeans(object$model * as.vector(object$dev)))) R <- 500 - stat <- vector("numeric", length=R) + stat <- vector("numeric", length = R) for (i in 1:R) { g <- rnorm(n) - dev.g <- as.vector(g*object$dev) - mat <- object$model*dev.g - stat[i] <- sqrt(n)*max(abs(colMeans(mat))) + dev.g <- as.vector(g * object$dev) + mat <- object$model * dev.g + stat[i] <- sqrt(n) * max(abs(colMeans(mat))) } - #quantstat <- quantile(stat, probs=c(1-alpha)) - object$pvalue <- sum(stat>object$supscore)/R - cat("\nJoint significance test:\n", "the sup score statistic for joint significance test is", formatC(object$supscore, digits = digits), "with a p-value of", - formatC(object$pvalue, digits = digits)) + # quantstat <- quantile(stat, probs=c(1-alpha)) + object$pvalue <- sum(stat > object$supscore) / R + cat( + "\nJoint significance test:\n", "the sup score statistic for joint significance test is", formatC(object$supscore, digits = digits), "with a p-value of", + formatC(object$pvalue, digits = digits) + ) } cat("\n") invisible(object) @@ -657,13 +684,12 @@ summary.rlasso <- function(object, all=TRUE, digits = max(3L, getOption("digits" # } # return(mm) # } -model.matrix.rlasso <- function (object, ...) -{ - if(is.null(object$model)){ - if (!is.null(object$call$x)){ +model.matrix.rlasso <- function(object, ...) 
{ + if (is.null(object$model)) { + if (!is.null(object$call$x)) { mm <- as.matrix(eval(object$call$x)) } else { - if(!is.null(object$call$data)){ + if (!is.null(object$call$data)) { mm <- model.matrix(eval(object$call$formula), data = eval(object$call$data))[, -1, drop = FALSE] } else { mm <- model.matrix(eval(object$call$formula))[, -1, drop = FALSE] @@ -713,7 +739,7 @@ model.matrix.rlasso <- function (object, ...) # } # n <- dim(X)[1] #length(object$residuals) # beta <- object$beta -# +# # if (object$options[["intercept"]]) { # yhat <- X %*% beta + object$intercept # if (dim(X)[2]==0) yhat <- rep(object$intercept, n) @@ -725,7 +751,7 @@ model.matrix.rlasso <- function (object, ...) # return(yhat) # } -# predict.rlasso2 <- function (object, newdata = NULL, ...) +# predict.rlasso2 <- function (object, newdata = NULL, ...) # { # if (missing(newdata) || is.null(newdata)) { # X <- model.matrix(object) @@ -747,7 +773,7 @@ model.matrix.rlasso <- function (object, ...) # stop("newdata does not contain the variables specified in formula") # } # } -# } +# } # if (sum(object$options$ind.scale) != 0) { # X <- X[, -object$options$ind.scale] # } @@ -756,23 +782,22 @@ model.matrix.rlasso <- function (object, ...) # beta <- object$beta # if (object$options[["intercept"]]) { # yhat <- X %*% beta + object$intercept -# if (dim(X)[2] == 0) +# if (dim(X)[2] == 0) # yhat <- rep(object$intercept, n) # } # if (!object$options[["intercept"]]) { # yhat <- X %*% beta -# if (dim(X)[2] == 0) +# if (dim(X)[2] == 0) # yhat <- rep(0, n) # } # return(yhat) # } -predict.rlasso <- function (object, newdata = NULL, ...) -{ +predict.rlasso <- function(object, newdata = NULL, ...) { mf <- match.call(expand.dots = TRUE) m <- match("newx", names(mf), 0L) - if (m!=0L) stop("Please use argument \"newdata\" instead of \"newx\" to provide data for prediction.") + if (m != 0L) stop("Please use argument \"newdata\" instead of \"newx\" to provide data for prediction.") k <- length(object$beta) if (missing(newdata) || is.null(newdata)) { X <- model.matrix(object) @@ -785,49 +810,39 @@ predict.rlasso <- function (object, newdata = NULL, ...) if (dim(X)[2] != k) { stop("No variable names provided in newdata and number of parameters does not fit!") } else { - #message("No variable names provided in newdata. Prediction relies on right ordering of the variables.") + # message("No variable names provided in newdata. 
Prediction relies on right ordering of the variables.")
      }
    } else {
      varcoef <- names(object$beta)
-      if (all(is.element(varcoef, colnames(newdata)))){
+      if (all(is.element(varcoef, colnames(newdata)))) {
        X <- as.matrix(newdata[, varcoef])
      } else {
        mod.frame <- as.data.frame(cbind(rep(1, nrow(newdata)), newdata))
        colnames(mod.frame)[1] <- as.character(eval(object$call$formula)[[2]])
        X <- try(model.matrix(eval(object$call$formula), data = mod.frame)[, -1, drop = FALSE])
-        if(inherits(X, "try-error")){
+        if (inherits(X, "try-error")) {
          stop("newdata does not contain the variables specified in formula")
-        }
+        }
      }
    }
  }
-  if (sum(object$options$ind.scale) != 0) {
-    X <- X[, -object$options$ind.scale]
-  }
-  
+  if (sum(object$options$ind.scale) != 0) {
+    X <- X[, -object$options$ind.scale]
+  }
+
  n <- dim(X)[1]
  beta <- object$beta
  if (object$options[["intercept"]]) {
    yhat <- X %*% beta + object$intercept
-    if (dim(X)[2] == 0) 
+    if (dim(X)[2] == 0) {
      yhat <- rep(object$intercept, n)
+    }
  }
  if (!object$options[["intercept"]]) {
    yhat <- X %*% beta
-    if (dim(X)[2] == 0) 
+    if (dim(X)[2] == 0) {
      yhat <- rep(0, n)
+    }
  }
  return(yhat)
}
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/R/rlassoAutoDML.R b/R/rlassoAutoDML.R
new file mode 100644
index 0000000..cdaee2a
--- /dev/null
+++ b/R/rlassoAutoDML.R
@@ -0,0 +1,118 @@
+#' Auto DML based on rlasso
+#'
+#' Implements the Double ML algorithm for estimating causal effects.
+#' The method was first introduced in https://arxiv.org/abs/1608.00060, which required
+#' manual calculation of the Riesz representer. In Auto DML (from https://arxiv.org/abs/1809.05224)
+#' the Riesz representer is estimated automatically and doesn't need to be explicitly
+#' computed.
+#'
+#' This implementation uses \code{rlasso} for the outcome regression and \code{LassoShooting.fit} for the Riesz representer, combined with cross-fitting over \code{L} folds.
+#'
+#' @param Y A vector of outputs
+#' @param D A vector of treatment values
+#' @param X A matrix of covariates
+#' @param dict A dictionary function of (d, z) that maps to a vector of
+#' basis terms used for both the regression and the Riesz representer;
+#' the default is (1, d, z)
+#' @param bias logical. If \code{TRUE}, the biased plug-in estimate is returned instead of the debiased one (default \code{FALSE})
+#' @param D_LB lower bound applied to the estimated penalty loadings in \code{RMD_stable} (default 0)
+#' @param D_add value added to the estimated penalty loadings in \code{RMD_stable} (default 0.2)
+#' @param L number of folds data is split into (default 5)
+#' @param max_iter maximum number of iterations of the Lasso-based Riesz representer estimation (default 10)
+#' @return vector containing the number of treated and untreated observations, the average treatment effect, and its standard error
+#' @examples
+#' # data = simulate_data(500)
+#' #
+#' # Y = data[[1]]
+#' # D = data[[2]]
+#' # X = data[[3]]
+#'
+#' # rlassoAutoDML(Y,D,X,dict = b2)
+#' # rlassoAutoDML(Y, D, X)
+#' @export
+#' @rdname rlassoDML
rlassoAutoDML <- function(Y, D, X, dict = NULL, D_LB = 0, D_add = 0.2,
+                          bias = FALSE, L = 5, max_iter = 10) {
+
+  if (is.null(dict)) {
+    dict = function(d,z){
+      return(c(1,d,z))
+    }
+
+  }
+  p <- length(dict(D[1], X[1, ]))
+
+  # p0=dim(X0) used in low-dim dictionary in the stage 1 tuning procedure
+  # TODO: is this comment true or is p0 = dim(X0)/4 used or can we drop this?
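+  # p0 is the number of leading covariates handed to RMD_stable, which uses
+  # X[, 1:p0] for an initial low-dimensional estimate of the Riesz representer
+  # coefficients (used to form the starting penalty loadings).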
+ p0 <- ceiling(p / 4) + if (p > 60) { + p0 <- ceiling(p / 40) + + } + n <- nrow(X) + folds <- split(sample(n, n, replace = FALSE), as.factor(1:L)) + + Psi_tilde <- numeric(0) + + for (l in 1:L) { + Y.l <- Y[folds[[l]]] + Y.nl <- Y[-folds[[l]]] + + T.l <- D[folds[[l]]] + T.nl <- D[-folds[[l]]] + + X.l <- X[folds[[l]], ] + X.nl <- X[-folds[[l]], ] + + + n.l <- length(T.l) + n.nl <- length(T.nl) + + # get stage 1 (on nl) + rho_hat <- RMD_stable(Y.nl, T.nl, X.nl, p0, D_LB, D_add, max_iter, dict) + + alpha_hat <- function(d, z) { + return(dict(d, z) %*% rho_hat) + } + + n <- nrow(X.nl) + p <- length(dict(T.nl[1], X.nl[1, ])) + # Apply the dictionary b to W + B <- matrix(0, n, p) + for (i in 1:n) { + B[i, ] <- dict(T.nl[i], X.nl[i, ]) + } + + gamma_coeff <- rlasso(B, Y.nl, intercept = F)$coefficients + gamma_hat <- function(d, z) { + return(dict(d, z) %*% gamma_coeff) + } + + print(paste0("fold: ", l)) + + # get stage 2 (on l) + # psi_star + Psi_tilde.l <- rep(0, n.l) + for (i in 1:n.l) { + if (bias) { # plug-in + Psi_tilde.l[i] <- psi_tilde_bias(Y.l[i], T.l[i], X.l[i, ], m, alpha_hat, gamma_hat) # without subtracting theta_hat + } else { # DML + Psi_tilde.l[i] <- psi_tilde(Y.l[i], T.l[i], X.l[i, ], m, alpha_hat, gamma_hat) # without subtracting theta_hat + } + } + Psi_tilde <- c(Psi_tilde, Psi_tilde.l) + } + + # point estimation + ate <- mean(Psi_tilde) + + # influences + Psi <- Psi_tilde - ate + + var <- mean(Psi^2) + se <- sqrt(var / n) + + out <- c(table(D)[[2]], table(D)[[1]], ate, se) + + return(out) +} diff --git a/R/rlassoAutoDML_helpers.R b/R/rlassoAutoDML_helpers.R new file mode 100644 index 0000000..279f160 --- /dev/null +++ b/R/rlassoAutoDML_helpers.R @@ -0,0 +1,208 @@ +library("mvtnorm") + +two.norm <- function(x) { + return(sqrt(x %*% x)) +} + +m <- function(y, d, z, gamma) { # all data arguments to make interchangeable with m2 + return(gamma(1, z) - gamma(0, z)) +} + +m2 <- function(y, d, z, gamma) { + return(y * gamma(d, z)) +} + +psi_tilde <- function(y, d, z, m, alpha, gamma) { + return(m(y, d, z, gamma) + alpha(d, z) * (y - gamma(d, z))) +} + +psi_tilde_bias <- function(y, d, z, m, alpha, gamma) { + return(m(y, d, z, gamma)) +} + +get_MNG <- function(Y, D, X, b) { + p <- length(b(D[1], X[1, ])) + n.nl <- length(D) + + B <- matrix(0, n.nl, p) + M <- matrix(0, p, n.nl) + N <- matrix(0, p, n.nl) + + for (i in 1:n.nl) { + B[i, ] <- b(D[i], X[i, ]) + M[, i] <- m(Y[i], D[i], X[i, ], b) + N[, i] <- m2(Y[i], D[i], X[i, ], b) # this is a more general formulation for N + } + + M_hat <- rowMeans(M) + N_hat <- rowMeans(N) + G_hat <- t(B) %*% B / n.nl + + return(list(M_hat, N_hat, G_hat, B)) +} + + +get_D <- function(Y, D, X, m, rho_hat, b) { + n <- nrow(X) + p <- length(b(D[1], X[1, ])) + + df <- matrix(0, p, n) + for (i in 1:n) { + df[, i] <- b(D[i], X[i, ]) * as.vector(rho_hat %*% b(D[i], X[i, ])) - m(Y[i], D[i], X[i, ], b) + } + df <- df^2 + D2 <- rowMeans(df) + + D <- sqrt(D2) + return(D) # pass around D as vector +} + +#' RMD_stable +#' +#' TODO: insert description on what this function does and document all +#' parameters +#' +#' TODO: set defaults for arguments +#' @param Y A vector of outputs +#' @param D A vector of treatment values +#' @param X A matrix of covariates +#' @param p0 initial value of p +#' @param D_LB Lower bound on D (default 0) +#' @param D_add value added to D (default 0.2) +#' @param max_iter maximum iterations of Lasso (default 10) +#' @param b A dictionary +#' function of (d,z) that maps to a vector +#' default is (1,d,z) +#' @param c parameter to tune 
lambda (default 0.5)
+#' @param alpha parameter to tune lambda (default 0.1)
+#' @param tol minimum improvement to continue looping (default 1e-6)
+#'
+#' @export
+RMD_stable <- function(Y, D, X, p0, D_LB = 0, D_add = 0.2, max_iter = 10, b = NULL, c = 0.5, alpha = 0.1, tol = 1e-6) {
+
+
+  if (is.null(b)) {
+    b = function(d,z){
+      return(c(1,d,z))
+    }
+
+  }
+  k <- 1
+  l <- 0.1
+
+  p <- length(b(D[1], X[1, ]))
+  n <- length(D)
+
+  # low-dimensional moments
+  X0 <- X[, 1:p0]
+  MNG0 <- get_MNG(Y, D, X0, b)
+  M_hat0 <- MNG0[[1]]
+  N_hat0 <- MNG0[[2]]
+  G_hat0 <- MNG0[[3]]
+
+  # initial estimate
+  rho_hat0 <- solve(G_hat0, M_hat0)
+  rho_hat <- c(rho_hat0, rep(0, p - ncol(G_hat0)))
+  beta_hat0 <- solve(G_hat0, N_hat0)
+  beta_hat <- c(beta_hat0, rep(0, p - ncol(G_hat0)))
+
+  # moments
+  MNG <- get_MNG(Y, D, X, b)
+  M_hat <- MNG[[1]]
+  N_hat <- MNG[[2]]
+  G_hat <- MNG[[3]]
+
+  # penalty
+  lambda <- c * qnorm(1 - alpha / (2 * p)) / sqrt(n) # snippet
+
+  ###########
+  # alpha_hat
+  ###########
+  diff_rho <- 1
+
+  while (diff_rho > tol & k <= max_iter) {
+    # previous values
+    rho_hat_old <- rho_hat + 0
+
+    # normalization
+    D_hat_rho <- get_D(Y, D, X, m, rho_hat_old, b)
+    D_hat_rho <- pmax(D_LB, D_hat_rho)
+    D_hat_rho <- D_hat_rho + D_add
+
+    L <- c(l, rep(1, p - 1)) # dictionary is ordered (constant,...)
+    lambda_vec <- lambda * L * D_hat_rho # v3: insert D here
+    rho_hat <- LassoShooting.fit(G_hat, M_hat, lambda_vec, XX = -G_hat / 2, Xy = -M_hat / 2, beta.start = rep(0, p))$coefficients
+    # difference
+    diff_rho <- two.norm(rho_hat - rho_hat_old)
+    k <- k + 1
+
+  }
+
+  return(rho_hat)
+}
+
+#' prints output of rlassoAutoDML in an easy to read format
+#' @param spec1 output of rlassoAutoDML
+#'
+#' @export
+#' @rdname printer
+printer <- function(spec1) {
+  print(paste(" treated: ", spec1[1], " untreated: ", spec1[2], " ATE: ", round(spec1[3], 2), " SE: ", round(spec1[4], 2), sep = ""))
+}
+
+#' prints output of rlassoAutoDML in latex table format
+#' @param spec1 output of rlassoAutoDML
+#'
+#' @export
+#' @rdname for_tex
+for_tex <- function(spec1) {
+  print(paste(" & ", spec1[1], " & ", spec1[2], " & ", round(spec1[3], 2), " & ", round(spec1[4], 2), sep = ""))
+}
+
+
+b2<-function(d,z){
+  return(c(1,d,z,d*z))
+}
+
+simulate_data = function(n,method = 3,rank = 5){
+  ###designed to return list(Y,T,X) with ATE 2.2
+  ###Inputs
+  #n: dimensions of data Y:length n vec T:length n vec, X: n x 100
+  ###Output: list(Y,T,X)
+
+  X = matrix(0,n,100)
+  Y = c()
+  T = c()
+  beta = c()
+  for (i in 1:100){
+    beta = c(beta,1/(i^2))
+  }
+
+  sigma = diag(100)
+  for (i in 1:99){
+    sigma[i,i+1] = 0.5
+    sigma[i+1,i] = 0.5
+  }
+
+  for (i in 1:n){
+    X_i = rmvnorm(1,rep(0,100),sigma)
+    v = rnorm(1,0,1)
+    eps = rnorm(1,0,1)
+    if (3*X_i%*%beta+0.75*v>=0){
+      T_i = 1
+    }else{
+      T_i = 0
+    }
+    Y = c(Y,1.2*T_i+1.2*X_i%*%beta+T_i^2+T_i*X_i[1]+eps)
+    T = c(T,T_i)
+    X[i,] = X_i
+  }
+
+  return(list(Y,T,X))
+
+}
+
+
diff --git a/R/rlassoEffects.R b/R/rlassoEffects.R
index 1081336..e139820 100644
--- a/R/rlassoEffects.R
+++ b/R/rlassoEffects.R
@@ -10,7 +10,7 @@
 #' the control variables. The final estimation is done by a regression of the
 #' outcome on the treatment effect and the union of the selected variables in
 #' the first two steps. In partialling-out first the effect of the regressors on the outcome and the treatment variable is taken out by Lasso and then a regression of the residuals is conducted. The resulting estimator for \eqn{\alpha_0} is normal
-#' distributed which allows inference on the treatment effect.
It presents a wrap function for \code{rlassoEffect} +#' distributed which allows inference on the treatment effect. It presents a wrap function for \code{rlassoEffect} #' which does inference for a single variable. #' #' @param x matrix of regressor variables serving as controls and potential @@ -18,7 +18,7 @@ #' @param y outcome variable (vector or matrix) #' @param index vector of integers, logicals or variables names indicating the position (column) of #' variables (integer case), logical vector of length of the variables (TRUE or FALSE) or the variable names of \code{x} which should be used for inference / as treatment variables. -#' @param method method for inference, either 'partialling out' (default) or 'double selection'. +#' @param method method for inference, either 'partialling out' (default) or 'double selection'. #' @param I3 For the 'double selection'-method the logical vector \code{I3} has same length as the number of variables in \code{x}; #' indicates if variables (TRUE) should be included in any case to the model and they are exempt from selection. These variables should not be included in the \code{index}; hence the intersection with \code{index} must be the empty set. #' In the case of partialling out it is ignored. @@ -33,49 +33,51 @@ #' @export #' @rdname rlassoEffects #' @examples -#' library(hdm); library(ggplot2) +#' library(hdm) +#' library(ggplot2) #' set.seed(1) -#' n = 100 #sample size -#' p = 100 # number of variables -#' s = 3 # number of non-zero variables -#' X = matrix(rnorm(n*p), ncol=p) -#' colnames(X) <- paste("X", 1:p, sep="") -#' beta = c(rep(3,s), rep(0,p-s)) -#' y = 1 + X%*%beta + rnorm(n) -#' data = data.frame(cbind(y,X)) +#' n <- 100 # sample size +#' p <- 100 # number of variables +#' s <- 3 # number of non-zero variables +#' X <- matrix(rnorm(n * p), ncol = p) +#' colnames(X) <- paste("X", 1:p, sep = "") +#' beta <- c(rep(3, s), rep(0, p - s)) +#' y <- 1 + X %*% beta + rnorm(n) +#' data <- data.frame(cbind(y, X)) #' colnames(data)[1] <- "y" -#' fm = paste("y ~", paste(colnames(X), collapse="+")) -#' fm = as.formula(fm) -#' lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50)) -#' lasso.effect = rlassoEffects(fm, I = ~ X1 + X2 + X3 + X50, data=data) +#' fm <- paste("y ~", paste(colnames(X), collapse = "+")) +#' fm <- as.formula(fm) +#' lasso.effect <- rlassoEffects(X, y, index = c(1, 2, 3, 50)) +#' lasso.effect <- rlassoEffects(fm, I = ~ X1 + X2 + X3 + X50, data = data) #' print(lasso.effect) #' summary(lasso.effect) #' confint(lasso.effect) #' plot(lasso.effect) -# library(hdm) -# ## DGP -# n <- 250 -# p <- 100 -# px <- 10 -# X <- matrix(rnorm(n*p), ncol=p) -# beta <- c(rep(2,px), rep(0,p-px)) -# intercept <- 1 -# y <- intercept + X %*% beta + rnorm(n) -# ## fit rlassoEffects object with inference on three variables -# rlassoEffects.reg <- rlassoEffects(x=X, y=y, index=c(1,7,20)) -# ## methods -# summary(rlassoEffects.reg) -# confint(rlassoEffects.reg, level=0.9) -rlassoEffects <- function(x, ...) - UseMethod("rlassoEffects") # definition generic function +#' # library(hdm) +#' # ## DGP +#' # n <- 250 +#' # p <- 100 +#' # px <- 10 +#' # X <- matrix(rnorm(n*p), ncol=p) +#' # beta <- c(rep(2,px), rep(0,p-px)) +#' # intercept <- 1 +#' # y <- intercept + X %*% beta + rnorm(n) +#' # ## fit rlassoEffects object with inference on three variables +#' # rlassoEffects.reg <- rlassoEffects(x=X, y=y, index=c(1,7,20)) +#' # ## methods +#' # summary(rlassoEffects.reg) +#' # confint(rlassoEffects.reg, level=0.9) +rlassoEffects <- function(x, ...) 
{ + UseMethod("rlassoEffects") +} # definition generic function #' @export #' @rdname rlassoEffects -rlassoEffects.default <- function(x, y, index = c(1:ncol(x)), method = "partialling out", +rlassoEffects.default <- function(x, y, index = c(1:ncol(x)), method = "partialling out", I3 = NULL, post = TRUE, ...) { - + checkmate::checkChoice(method, c("partialling out", "double selection")) - + if (is.logical(index)) { k <- p1 <- sum(index) } else { @@ -105,12 +107,14 @@ rlassoEffects.default <- function(x, y, index = c(1:ncol(x)), method = "partiall if (method == "double selection") { # check validity of I3 I3ind <- which(I3 == T) - if (length(intersect(index, I3ind) != 0)) + if (length(intersect(index, I3ind) != 0)) { stop("I3 and index must not overlap!") + } } - - if (is.null(colnames(x))) + + if (is.null(colnames(x))) { colnames(x) <- paste("V", 1:dim(x)[2], sep = "") + } coefficients <- as.vector(rep(NA_real_, k)) se <- rep(NA_real_, k) t <- rep(NA_real_, k) @@ -126,8 +130,10 @@ rlassoEffects.default <- function(x, y, index = c(1:ncol(x)), method = "partiall d <- x[, index[i], drop = FALSE] Xt <- x[, -index[i], drop = FALSE] I3m <- I3[-index[i]] - lasso.regs[[i]] <- try(col <- rlassoEffect(Xt, y, d, method = method, - I3 = I3m, post = post, ...), silent = TRUE) + lasso.regs[[i]] <- try(col <- rlassoEffect(Xt, y, d, + method = method, + I3 = I3m, post = post, ... + ), silent = TRUE) if (class(lasso.regs[[i]]) == "try-error") { next } else { @@ -138,14 +144,16 @@ rlassoEffects.default <- function(x, y, index = c(1:ncol(x)), method = "partiall reside[, i] <- col$residuals$epsilon residv[, i] <- col$residuals$v coef.mat[[i]] <- col$coefficients.reg - selection.matrix[-index[i],i] <- col$selection.index + selection.matrix[-index[i], i] <- col$selection.index } } - #colnames(coef.mat) <- colnames(x)[index] + # colnames(coef.mat) <- colnames(x)[index] residuals <- list(e = reside, v = residv) - res <- list(coefficients = coefficients, se = se, t = t, pval = pval, - lasso.regs = lasso.regs, index = index, call = match.call(), samplesize = n, - residuals = residuals, coef.mat = coef.mat, selection.matrix = selection.matrix) + res <- list( + coefficients = coefficients, se = se, t = t, pval = pval, + lasso.regs = lasso.regs, index = index, call = match.call(), samplesize = n, + residuals = residuals, coef.mat = coef.mat, selection.matrix = selection.matrix + ) class(res) <- "rlassoEffects" return(res) } @@ -154,13 +162,13 @@ rlassoEffects.default <- function(x, y, index = c(1:ncol(x)), method = "partiall #' @param formula An element of class \code{formula} specifying the linear model. #' @param I An one-sided formula specifying the variables for which inference is conducted. #' @param included One-sided formula of variables which should be included in any case (only for method="double selection"). -#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which the function is called. #' @export -rlassoEffects.formula <- function(formula, data, I, method = "partialling out", +rlassoEffects.formula <- function(formula, data, I, method = "partialling out", included = NULL, post = TRUE, ...) 
{ cl <- match.call() - if (missing(data)) data <- environment(formula) + if (missing(data)) data <- environment(formula) mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data"), names(mf), 0L) mf <- mf[c(1L, m)] @@ -171,20 +179,22 @@ rlassoEffects.formula <- function(formula, data, I, method = "partialling out", attr(mt, "intercept") <- 1 y <- model.response(mf, "numeric") n <- length(y) - x <- model.matrix(mt, mf)[,-1, drop=FALSE] + x <- model.matrix(mt, mf)[, -1, drop = FALSE] cn <- attr(mt, "term.labels") - try(if (is.matrix(eval(parse(text=cn)))) cn <- colnames(eval(parse(text=cn))), silent=TRUE) + try(if (is.matrix(eval(parse(text = cn)))) cn <- colnames(eval(parse(text = cn))), silent = TRUE) I.c <- check_variables(I, cn) - #I.c <- grep(cn[I.c],colnames(X)) + # I.c <- grep(cn[I.c],colnames(X)) I.c <- which(colnames(x) %in% cn[I.c]) I3 <- check_variables(included, cn) - #I3 <- grep(cn[I.c],colnames(X)) + # I3 <- grep(cn[I.c],colnames(X)) I3 <- which(colnames(x) %in% cn[I.c]) - #if (length(intersect(I.c, I3) != 0)) + # if (length(intersect(I.c, I3) != 0)) # stop("I and included should not contain the same variables!") - - est <- rlassoEffects(x, y, index = I.c, method = method, - I3 = I3, post = post, ...) + + est <- rlassoEffects(x, y, + index = I.c, method = method, + I3 = I3, post = post, ... + ) est$call <- cl return(est) } @@ -192,28 +202,30 @@ rlassoEffects.formula <- function(formula, data, I, method = "partialling out", #' @rdname rlassoEffects #' @param d variable for which inference is conducted (treatment variable) #' @export -rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, +rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, post = TRUE, ...) { d <- as.matrix(d, ncol = 1) y <- as.matrix(y, ncol = 1) kx <- dim(x)[2] n <- dim(x)[1] - if (is.null(colnames(d))) + if (is.null(colnames(d))) { colnames(d) <- "d1" - if (is.null(colnames(x)) & !is.null(x)) + } + if (is.null(colnames(x)) & !is.null(x)) { colnames(x) <- paste("x", 1:kx, sep = "") + } if (method == "double selection") { I1 <- rlasso(d ~ x, post = post, ...)$index I2 <- rlasso(y ~ x, post = post, ...)$index - - + + if (is.logical(I3)) { I <- I1 + I2 + I3 I <- as.logical(I) } else { I <- I1 + I2 I <- as.logical(I) - names(I) <- union(names(I1),names(I2)) + names(I) <- union(names(I1), names(I2)) } if (sum(I) == 0) { I <- NULL @@ -222,7 +234,7 @@ rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, reg1 <- lm(y ~ x) alpha <- coef(reg1)[2] names(alpha) <- colnames(d) - xi <- reg1$residuals * sqrt(n/(n - sum(I) - 1)) + xi <- reg1$residuals * sqrt(n / (n - sum(I) - 1)) if (is.null(I)) { reg2 <- lm(d ~ 1) } @@ -230,9 +242,9 @@ rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, reg2 <- lm(d ~ x[, -1, drop = FALSE]) } v <- reg2$residuals - var <- 1/n * 1/mean(v^2) * mean(v^2 * xi^2) * 1/mean(v^2) + var <- 1 / n * 1 / mean(v^2) * mean(v^2 * xi^2) * 1 / mean(v^2) se <- sqrt(var) - tval <- alpha/sqrt(var) + tval <- alpha / sqrt(var) pval <- 2 * pnorm(-abs(tval)) if (is.null(I)) { no.selected <- 1 @@ -247,12 +259,14 @@ rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, # samplesize=n) se <- drop(se) names(se) <- colnames(d) - results <- list(alpha = alpha, se = se, t = tval, pval = pval, - no.selected = no.selected, coefficients = alpha, coefficient = alpha, - coefficients.reg = coef(reg1), selection.index = I, residuals = res, call = match.call(), - samplesize = n) + results <- list( + alpha = alpha, se 
= se, t = tval, pval = pval, + no.selected = no.selected, coefficients = alpha, coefficient = alpha, + coefficients.reg = coef(reg1), selection.index = I, residuals = res, call = match.call(), + samplesize = n + ) } - + if (method == "partialling out") { reg1 <- rlasso(y ~ x, post = post, ...) yr <- reg1$residuals @@ -262,17 +276,19 @@ rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, alpha <- coef(reg3)[2] var <- vcov(reg3)[2, 2] se <- sqrt(var) - tval <- alpha/sqrt(var) + tval <- alpha / sqrt(var) pval <- 2 * pnorm(-abs(tval)) res <- list(epsilon = reg3$residuals, v = dr) I1 <- reg1$index I2 <- reg2$index I <- as.logical(I1 + I2) - names(I) <- union(names(I1),names(I2)) - results <- list(alpha = unname(alpha), se = drop(se), t = unname(tval), - pval = unname(pval), coefficients = unname(alpha), coefficient = unname(alpha), - coefficients.reg = coef(reg1), selection.index = I, residuals = res, call = match.call(), - samplesize = n) + names(I) <- union(names(I1), names(I2)) + results <- list( + alpha = unname(alpha), se = drop(se), t = unname(tval), + pval = unname(pval), coefficients = unname(alpha), coefficient = unname(alpha), + coefficients.reg = coef(reg1), selection.index = I, residuals = res, call = match.call(), + samplesize = n + ) } class(results) <- "rlassoEffects" return(results) @@ -295,15 +311,21 @@ rlassoEffect <- function(x, y, d, method = "double selection", I3 = NULL, #' @aliases methods.rlassoEffects print.rlassoEffects confint.rlassoEffects plot.rlassoEffects #' @export -print.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") +print.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - + 3L), ...) { + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(coef(x))) { cat("Coefficients:\n") - print.default(format(coef(x), digits = digits), print.gap = 2L, - quote = FALSE) - } else cat("No coefficients\n") + print.default(format(coef(x), digits = digits), + print.gap = 2L, + quote = FALSE + ) + } else { + cat("No coefficients\n") + } cat("\n") invisible(coef(x)) } @@ -315,15 +337,15 @@ print.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - #' @param joint logical, if \code{TRUE} joint confidence intervals are calculated. #' @export -# confint.rlassoEffects <- function(object, parm, level = 0.95, joint = FALSE, +# confint.rlassoEffects <- function(object, parm, level = 0.95, joint = FALSE, # ...) 
{ # B <- 500 # number of bootstrap repitions # n <- object$samplesize # k <- p1 <- length(object$coefficients) # cf <- coef(object) # pnames <- names(cf) -# if (missing(parm)) -# parm <- pnames else if (is.numeric(parm)) +# if (missing(parm)) +# parm <- pnames else if (is.numeric(parm)) # parm <- pnames[parm] # if (!joint) { # a <- (1 - level)/2 @@ -331,12 +353,12 @@ print.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - # # fac <- qt(a, n-k) # fac <- qnorm(a) # pct <- format.perc(a, 3) -# ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, +# ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, # pct)) # ses <- object$se[parm] # ci[] <- cf[parm] + ses %o% fac # } -# +# # if (joint) { # phi <- object$residuals$e * object$residuals$v # m <- 1/sqrt(colMeans(phi^2)) @@ -352,7 +374,7 @@ print.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - # a <- (1 - level)/2 # ab <- c(a, 1 - a) # pct <- format.perc(ab, 3) -# ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, +# ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, # pct)) # hatc <- quantile(sim, probs = 1 - a) # ci[, 1] <- cf[parm] - hatc * 1/sqrt(n) * sigma @@ -362,51 +384,57 @@ print.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - # } -confint.rlassoEffects <- function(object, parm, level = 0.95, joint = FALSE, +confint.rlassoEffects <- function(object, parm, level = 0.95, joint = FALSE, ...) { - B <- 500 # number of bootstrap repitions + B <- 500 # number of bootstrap repitions n <- object$samplesize k <- p1 <- length(object$coefficients) cf <- coef(object) pnames <- names(cf) - if (missing(parm)) - parm <- pnames else if (is.numeric(parm)) - parm <- pnames[parm] + if (missing(parm)) { + parm <- pnames + } else if (is.numeric(parm)) { + parm <- pnames[parm] + } if (!joint) { - a <- (1 - level)/2 + a <- (1 - level) / 2 a <- c(a, 1 - a) # fac <- qt(a, n-k) fac <- qnorm(a) pct <- format.perc(a, 3) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list( + parm, + pct + )) ses <- object$se[parm] ci[] <- cf[parm] + ses %o% fac } - + if (joint) { e <- object$residuals$e v <- object$residuals$v - ev <- e*v + ev <- e * v Ev2 <- colMeans(v^2) - Omegahat <- matrix(NA, ncol=k, nrow=k) + Omegahat <- matrix(NA, ncol = k, nrow = k) for (j in 1:k) { for (l in 1:k) { - Omegahat[j,l] = Omegahat[l,j] = 1/(Ev2[j]*Ev2[l]) * mean(ev[,j]*ev[,l]) + Omegahat[j, l] <- Omegahat[l, j] <- 1 / (Ev2[j] * Ev2[l]) * mean(ev[, j] * ev[, l]) } } var <- diag(Omegahat) names(var) <- names(cf) sim <- vector("numeric", length = B) for (i in 1:B) { - beta_i <- MASS::mvrnorm(mu = rep(0,k), Sigma=Omegahat/n) - sim[i] <- max(abs(beta_i/sqrt(var))) + beta_i <- MASS::mvrnorm(mu = rep(0, k), Sigma = Omegahat / n) + sim[i] <- max(abs(beta_i / sqrt(var))) } - a <- (1 - level) #not dividing by 2! - ab <- c(a/2, 1 - a/2) + a <- (1 - level) # not dividing by 2! 
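+    # Joint (sup-t) bands: hatc below is the (1 - level) quantile of the simulated
+    # maximum of the standardized coefficients, so the full error level is spent on
+    # the maximum statistic rather than being split across two tails.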
+ ab <- c(a / 2, 1 - a / 2) pct <- format.perc(ab, 3) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list( + parm, + pct + )) hatc <- quantile(sim, probs = 1 - a) ci[, 1] <- cf[parm] - hatc * sqrt(var[parm]) ci[, 2] <- cf[parm] + hatc * sqrt(var[parm]) @@ -421,23 +449,23 @@ confint.rlassoEffects <- function(object, parm, level = 0.95, joint = FALSE, #' @param xlab a title for the x axis #' @param ylab a title for the y axis #' @param xlim vector of length two giving lower and upper bound of x axis -plot.rlassoEffects <- function(x, joint=FALSE, level= 0.95, main = "", xlab = "coef", ylab = "", +plot.rlassoEffects <- function(x, joint = FALSE, level = 0.95, main = "", xlab = "coef", ylab = "", xlim = NULL, ...) { - + # generate ordered KI-matrix - coefmatrix <- cbind(summary(x)$coef, confint(x, joint = joint, level=level))[, c(1, 5, 6)] + coefmatrix <- cbind(summary(x)$coef, confint(x, joint = joint, level = level))[, c(1, 5, 6)] if (is.null(dim(coefmatrix))) { vec <- coefmatrix coefmatrix <- matrix(vec, ncol = 3) colnames(coefmatrix) <- names(vec) } - + rownames(coefmatrix) <- names(x$coefficients) coefmatrix <- as.data.frame(coefmatrix) coefmatrix <- cbind(rownames(coefmatrix), coefmatrix) colnames(coefmatrix) <- c("names", "coef", "lower", "upper") coefmatrix <- coefmatrix[order(abs(coefmatrix[, 2])), ] - + col <- "#000099" # scale if (missing(xlim)) { @@ -448,24 +476,30 @@ plot.rlassoEffects <- function(x, joint=FALSE, level= 0.95, main = "", xlab = "c up <- xlim[2] } # generate points - plotobject <- ggplot2::ggplot(coefmatrix, ggplot2::aes(y = coef, x = factor(names, - levels = names))) + ggplot2::geom_point(colour = col) + - ggplot2::geom_hline(colour = col, ggplot2::aes(width = 0.1, h = 0, yintercept=0)) - + plotobject <- ggplot2::ggplot(coefmatrix, ggplot2::aes(y = coef, x = factor(names, + levels = names + ))) + + ggplot2::geom_point(colour = col) + + ggplot2::geom_hline(colour = col, ggplot2::aes(width = 0.1, h = 0, yintercept = 0)) + # generate errorbars (KIs) - plotobject <- plotobject + ggplot2::geom_errorbar(ymin = coefmatrix$lower, - ymax = coefmatrix$upper, colour = col) - + plotobject <- plotobject + ggplot2::geom_errorbar( + ymin = coefmatrix$lower, + ymax = coefmatrix$upper, colour = col + ) + # further graphic parameter - plotobject <- plotobject + ggplot2::ggtitle(main) + ggplot2::ylim(low, - up) + ggplot2::xlab(ylab) + ggplot2::ylab(xlab) - - + plotobject <- plotobject + ggplot2::ggtitle(main) + ggplot2::ylim( + low, + up + ) + ggplot2::xlab(ylab) + ggplot2::ylab(xlab) + + ## invert x and y axis - #plotobject <- plotobject + ggplot2::coord_flip() - + # plotobject <- plotobject + ggplot2::coord_flip() + # layout - plotobject <- plotobject + ggplot2::theme_bw() + ggplot2::geom_blank() + + plotobject <- plotobject + ggplot2::theme_bw() + ggplot2::geom_blank() + ggplot2::theme(panel.grid.major.x = ggplot2::element_blank(), panel.grid.minor.x = ggplot2::element_blank()) plotobject <- plotobject + ggplot2::scale_x_discrete(labels = abbreviate) # plot @@ -476,15 +510,15 @@ plot.rlassoEffects <- function(x, joint=FALSE, level= 0.95, main = "", xlab = "c ################ Methods: summary #' Summarizing rlassoEffects fits -#' +#' #' Summary method for class \code{rlassoEffects} -#' +#' #' Summary of objects of class \code{rlassoEffects} -#' +#' #' @param object an object of class \code{rlassoEffects}, usually a result of a call to \code{rlassoEffects} #' @param ... 
further arguments passed to or from other methods. #' @rdname summary.rlassoEffects -#' @export +#' @export summary.rlassoEffects <- function(object, ...) { ans <- NULL k <- length(object$coefficients) @@ -507,8 +541,8 @@ summary.rlassoEffects <- function(object, ...) { #' @method print summary.rlassoEffects #' @rdname summary.rlassoEffects #' @export -print.summary.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - - 3L), ...) { +print.summary.rlassoEffects <- function(x, digits = max(3L, getOption("digits") - + 3L), ...) { if (length(coef(x$object))) { k <- dim(x$coefficients)[1] table <- x$coefficients @@ -526,57 +560,59 @@ print.summary.rlassoEffects <- function(x, digits = max(3L, getOption("digits") #' Coefficients from S3 objects \code{rlassoEffects} #' #' Method to extract coefficients from objects of class \code{rlassoEffects} -#' +#' #' Printing coefficients and selection matrix for S3 object \code{rlassoEffects}. Interpretation of entries in the selection matrix #' \itemize{ #' \item \code{"-"} indicates a target variable, #' \item \code{"x"} indicates that a variable has been selected with rlassoEffects (coefficient is different from zero), #' \item \code{"."} indicates that a variable has been de-selected with rlassoEffects (coefficient is zero). #' } -#' +#' #' @param object an object of class \code{rlassoEffects}, usually a result of a call \code{rlassoEffect} or \code{rlassoEffects}. -#' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. -#' Default is set to FALSE. -#' @param include.targets if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, -#' the selection matrix will also indicate the selection of the target coefficients that are specified in the \code{rlassoEffects} call. +#' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. +#' Default is set to FALSE. +#' @param include.targets if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, +#' the selection matrix will also indicate the selection of the target coefficients that are specified in the \code{rlassoEffects} call. #' @param complete general option of the function \code{coef}. -#' @param ... further arguments passed to functions coef or print. +#' @param ... further arguments passed to functions coef or print. 
#' @export #' @rdname coef.rlassoEffects #' @examples #' library(hdm) #' set.seed(1) -#' n = 100 #sample size -#' p = 100 # number of variables -#' s = 7 # number of non-zero variables -#' X = matrix(rnorm(n*p), ncol=p) -#' colnames(X) <- paste("X", 1:p, sep="") -#' beta = c(rep(3,s), rep(0,p-s)) -#' y = 1 + X%*%beta + rnorm(n) -#' data = data.frame(cbind(y,X)) +#' n <- 100 # sample size +#' p <- 100 # number of variables +#' s <- 7 # number of non-zero variables +#' X <- matrix(rnorm(n * p), ncol = p) +#' colnames(X) <- paste("X", 1:p, sep = "") +#' beta <- c(rep(3, s), rep(0, p - s)) +#' y <- 1 + X %*% beta + rnorm(n) +#' data <- data.frame(cbind(y, X)) #' colnames(data)[1] <- "y" -#' lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50), -#' method = "double selection") +#' lasso.effect <- rlassoEffects(X, y, +#' index = c(1, 2, 3, 50), +#' method = "double selection" +#' ) #' coef(lasso.effect) # standard use of coef() - without selection matrix #' # with selection matrix #' coef(lasso.effect, selection.matrix = TRUE) #' # prettier output with print_coef (identical options as coef()) -#' print_coef(lasso.effect, selection.matrix = TRUE) +#' print_coef(lasso.effect, selection.matrix = TRUE) coef.rlassoEffects <- function(object, complete = TRUE, selection.matrix = FALSE, include.targets = FALSE, ...) { - + cf <- object$coefficients - + if (selection.matrix == TRUE) { - + mat <- object$selection.matrix - + if (is.null(mat)) { mat <- cbind(object$selection.index) dmat2 <- dim(mat)[2] rnames <- rownames(mat) targetindx <- stats::complete.cases(mat) } - + else { dmat2 <- dim(mat)[2] rnames <- rownames(mat) @@ -584,100 +620,101 @@ coef.rlassoEffects <- function(object, complete = TRUE, selection.matrix = FALSE mat <- cbind(mat, as.logical(apply(mat, 1, sum))) colnames(mat)[dim(mat)[2]] <- "global" } - + if (include.targets == FALSE) { mat <- mat[targetindx, , drop = FALSE] rnames <- rownames(mat) } - - else{ + + else { mat <- rbind(mat[targetindx == FALSE, , drop = FALSE], mat[targetindx, , drop = FALSE]) rnames <- rownames(mat) } - + mat <- rbind(mat, apply(mat, 2, sum, na.rm = TRUE)) mat <- apply(mat, 2, function(x) gsub(1, "x", x)) mat <- apply(mat, 2, function(x) gsub(0, ".", x)) mat[is.na(mat)] <- "-" rownames(mat) <- c(rnames, "sum") - + if (complete) { - + coef <- list(cf = cf, selection.matrix = mat) return(coef) } - + else { coef <- list(cf = cf[!is.na(cf)], selection.matrix = mat) return(coef) - } + } } - + else { - if (complete) { return(cf) } - + else { return(cf[!is.na(cf)]) - } + } } - + } #' Printing coefficients from S3 objects \code{rlassoEffects} #' #' Printing coefficients for class \code{rlassoEffects} -#' +#' #' Printing coefficients and selection matrix for S3 object \code{rlassoEffects} -#' +#' #' @param x an object of class \code{rlassoEffects}, usually a result of a call \code{rlassoEffect} or \code{rlassoEffects}. -#' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. -#' Default is set to FALSE. -#' @param include.targets if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, -#' the selection matrix will also indicate the selection of the target coefficients that are specified in the \code{rlassoEffects} call. +#' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. +#' Default is set to FALSE. 
+#' @param include.targets if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, +#' the selection matrix will also indicate the selection of the target coefficients that are specified in the \code{rlassoEffects} call. #' @param complete general option of the function \code{coef}. -#' @param ... further arguments passed to functions coef or print. +#' @param ... further arguments passed to functions coef or print. #' @rdname print_coef #' @aliases print_coef.rlassoEffects #' @export #' @examples #' library(hdm) #' set.seed(1) -#' n = 100 #sample size -#' p = 100 # number of variables -#' s = 7 # number of non-zero variables -#' X = matrix(rnorm(n*p), ncol=p) -#' colnames(X) <- paste("X", 1:p, sep="") -#' beta = c(rep(3,s), rep(0,p-s)) -#' y = 1 + X%*%beta + rnorm(n) -#' data = data.frame(cbind(y,X)) +#' n <- 100 # sample size +#' p <- 100 # number of variables +#' s <- 7 # number of non-zero variables +#' X <- matrix(rnorm(n * p), ncol = p) +#' colnames(X) <- paste("X", 1:p, sep = "") +#' beta <- c(rep(3, s), rep(0, p - s)) +#' y <- 1 + X %*% beta + rnorm(n) +#' data <- data.frame(cbind(y, X)) #' colnames(data)[1] <- "y" -#' lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50), -#' method = "double selection") +#' lasso.effect <- rlassoEffects(X, y, +#' index = c(1, 2, 3, 50), +#' method = "double selection" +#' ) #' # without target coefficient estimates -#' print_coef(lasso.effect, selection.matrix = TRUE) +#' print_coef(lasso.effect, selection.matrix = TRUE) #' # with target coefficient estimates -#' print_coef(lasso.effect, selection.matrix = TRUE, targets = TRUE) -print_coef <- function(x, ...){ +#' print_coef(lasso.effect, selection.matrix = TRUE, targets = TRUE) +print_coef <- function(x, ...) { UseMethod("print_coef") } -#' @rdname print_coef +#' @rdname print_coef #' @export -print_coef.rlassoEffects <- function(x, complete = TRUE, selection.matrix = FALSE, include.targets = TRUE, ...) { +print_coef.rlassoEffects <- function(x, complete = TRUE, selection.matrix = FALSE, include.targets = TRUE, ...) { checkmate::check_class(x, "rlassoEffects") - + if (selection.matrix == FALSE) { cat("\n") print("Estimated target coefficients") print(coef(x), complete = complete, ...) cat("\n") } - + else { sel.mat <- coef(x, selection.matrix = selection.matrix, include.targets = include.targets, complete = complete, ...) cat("\n") @@ -689,7 +726,7 @@ print_coef.rlassoEffects <- function(x, complete = TRUE, selection.matrix = FALS print(sel.mat$selection.matrix) cat("_ _ _ \n") print("'-' indicates a target variable; ") - print("'x' indicates that a variable has been selected with rlassoEffects (coefficient is different from zero);") + print("'x' indicates that a variable has been selected with rlassoEffects (coefficient is different from zero);") print("'.' indicates that a variable has been de-selected with rlassoEffects (coefficient is zero).") } -} \ No newline at end of file +} diff --git a/R/rlassoIV.R b/R/rlassoIV.R index 8637454..9c62d8c 100644 --- a/R/rlassoIV.R +++ b/R/rlassoIV.R @@ -7,8 +7,8 @@ #' The implementation for selection on x and z follows the procedure described in Chernozhukov et al. #' (2015) and is built on 'triple selection' to achieve an orthogonal moment #' function. The function returns an object of S3 class \code{rlassoIV}. 
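Illustrative sketch (reviewer addition, not part of the patch): rlassoIV acts as a wrapper whose select.Z / select.X switches route to four different estimators, as implemented in rlassoIV.default below. The helper choose_iv_estimator is hypothetical and only documents that dispatch.

choose_iv_estimator <- function(select.Z = TRUE, select.X = TRUE) {
  if (!select.Z && !select.X) return("tsls: no selection")
  if (select.Z && !select.X) return("rlassoIVselectZ: lasso selection among the instruments z")
  if (!select.Z && select.X) return("rlassoIVselectX: lasso selection among the controls x")
  "triple selection on (z, x), yielding an orthogonal moment function"
}
choose_iv_estimator(select.Z = TRUE, select.X = FALSE)
# [1] "rlassoIVselectZ: lasso selection among the instruments z"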
-#' Moreover, it is wrap function for the case that selection should be done only with the instruments Z (\code{rlassoIVselectZ}) or with -#' the control variables X (\code{rlassoIVselectX}) or without selection (\code{tsls}). Exogenous variables +#' Moreover, it is wrap function for the case that selection should be done only with the instruments Z (\code{rlassoIVselectZ}) or with +#' the control variables X (\code{rlassoIVselectX}) or without selection (\code{tsls}). Exogenous variables #' \code{x} are automatically used as instruments and added to the #' instrument set \code{z}. #' @@ -30,50 +30,54 @@ #' @rdname rlassoIV #' @export #' @examples -#'\dontrun{ +#' \dontrun{ #' data(EminentDomain) #' z <- EminentDomain$logGDP$z # instruments #' x <- EminentDomain$logGDP$x # exogenous variables #' y <- EminentDomain$logGDP$y # outcome varialbe #' d <- EminentDomain$logGDP$d # treatment / endogenous variable -#' lasso.IV.Z = rlassoIV(x=x, d=d, y=y, z=z, select.X=FALSE, select.Z=TRUE) +#' lasso.IV.Z <- rlassoIV(x = x, d = d, y = y, z = z, select.X = FALSE, select.Z = TRUE) #' summary(lasso.IV.Z) #' confint(lasso.IV.Z) #' } -rlassoIV <- function(x, ...) - UseMethod("rlassoIV") # definition generic function +rlassoIV <- function(x, ...) { + UseMethod("rlassoIV") +} # definition generic function #' @rdname rlassoIV #' @export -rlassoIV.default <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, post = TRUE, - ...) { +rlassoIV.default <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, post = TRUE, + ...) { d <- as.matrix(d) z <- as.matrix(z) - if (is.null(colnames(d))) + if (is.null(colnames(d))) { colnames(d) <- paste("d", 1:ncol(d), sep = "") - if (is.null(colnames(x)) & !is.null(x)) + } + if (is.null(colnames(x)) & !is.null(x)) { colnames(x) <- paste("x", 1:ncol(x), sep = "") - if (is.null(colnames(z)) & !is.null(z)) + } + if (is.null(colnames(z)) & !is.null(z)) { colnames(z) <- paste("z", 1:ncol(z), sep = "") + } n <- length(y) - + if (select.Z == FALSE && select.X == FALSE) { res <- tsls(x, d, y, z, homoscedastic = FALSE, ...) return(res) } - + if (select.Z == TRUE && select.X == FALSE) { res <- rlassoIVselectZ(x, d, y, z, post = post, ...) return(res) } - + if (select.Z == FALSE && select.X == TRUE) { res <- rlassoIVselectX(x, d, y, z, post = post, ...) return(res) } - + if (select.Z == TRUE && select.X == TRUE) { - + Z <- cbind(z, x) lasso.d.zx <- rlasso(Z, d, post = post, ...) lasso.y.x <- rlasso(x, y, post = post, ...) @@ -85,43 +89,45 @@ rlassoIV.default <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, post selection.matrixZ <- matrix(NA, ncol = dim(d)[2], nrow = dim(Z)[2]) rownames(selection.matrixZ) <- colnames(Z) colnames(selection.matrixZ) <- colnames(d) - selection.matrixZ[,1] <- ind.dzx <- lasso.d.zx$index - - #PZ <- Z[, ind.dzx] %*% MASS::ginv(t(Z[, ind.dzx]) %*% Z[, ind.dzx]) %*% + selection.matrixZ[, 1] <- ind.dzx <- lasso.d.zx$index + + # PZ <- Z[, ind.dzx] %*% MASS::ginv(t(Z[, ind.dzx]) %*% Z[, ind.dzx]) %*% # t(Z[, ind.dzx]) %*% d PZ <- as.matrix(predict(lasso.d.zx)) lasso.PZ.x <- rlasso(x, PZ, post = post, ...) 
- + selection.matrix <- matrix(NA, ncol = (1 + dim(d)[2]), nrow = dim(x)[2]) rownames(selection.matrix) <- colnames(x) colnames(selection.matrix) <- c("y", colnames(d)) - selection.matrix[ , 1] <- lasso.y.x$index - selection.matrix[ , 2] <- ind.PZx <- lasso.PZ.x$index - + selection.matrix[, 1] <- lasso.y.x$index + selection.matrix[, 2] <- ind.PZx <- lasso.PZ.x$index + if (sum(ind.PZx) == 0) { Dr <- d - mean(d) } else { - Dr <- d - predict(lasso.PZ.x) #x[,ind.PZx]%*%MASS::ginv(t(x[,ind.PZx])%*%x[,ind.PZx])%*%t(x[,ind.PZx])%*%PZ + Dr <- d - predict(lasso.PZ.x) # x[,ind.PZx]%*%MASS::ginv(t(x[,ind.PZx])%*%x[,ind.PZx])%*%t(x[,ind.PZx])%*%PZ } - + if (sum(lasso.y.x$index) == 0) { Yr <- y - mean(y) } else { Yr <- lasso.y.x$residuals } - + if (sum(lasso.PZ.x$index) == 0) { Zr <- PZ - mean(x) } else { Zr <- lasso.PZ.x$residuals } - + result <- tsls(y = Yr, d = Dr, x = NULL, z = Zr, intercept = FALSE, homoscedastic = FALSE) coef <- as.vector(result$coefficient) se <- diag(sqrt(result$vcov)) names(coef) <- names(se) <- colnames(d) - res <- list(coefficients = coef, se = se, vcov = vcov, call = match.call(), - samplesize = n, selection.matrixZ = selection.matrixZ, selection.matrix = selection.matrix) + res <- list( + coefficients = coef, se = se, vcov = vcov, call = match.call(), + samplesize = n, selection.matrixZ = selection.matrixZ, selection.matrix = selection.matrix + ) class(res) <- "rlassoIV" return(res) } @@ -130,25 +136,27 @@ rlassoIV.default <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, post #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, #' d endogenous variable, z instrumental variables, and x exogenous variables. -#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoIV} is called. #' @rdname rlassoIV #' @export -rlassoIV.formula <- function(formula, data, select.Z = TRUE, select.X = TRUE, post = TRUE, - ...) { - +rlassoIV.formula <- function(formula, data, select.Z = TRUE, select.X = TRUE, post = TRUE, + ...) { + mat <- f.formula(formula, data, all.categories = FALSE) - + y <- mat$Y x <- mat$X d <- mat$D z <- mat$Z - - res <- rlassoIV(x=x, d=d, y=y, z=z, select.Z = select.Z, select.X = select.X, post = post, - ...) + + res <- rlassoIV( + x = x, d = d, y = y, z = z, select.Z = select.Z, select.X = select.X, post = post, + ... + ) res$call <- match.call() return(res) - + } @@ -156,7 +164,7 @@ rlassoIV.formula <- function(formula, data, select.Z = TRUE, select.X = TRUE, po #' Methods for S3 object \code{rlassoIV} #' -#' Objects of class \code{rlassoIV} are constructed by \code{rlassoIV}. +#' Objects of class \code{rlassoIV} are constructed by \code{rlassoIV}. #' \code{print.rlassoIV} prints and displays some information about fitted \code{rlassoIV} objects. #' \code{summary.rlassoIV} summarizes information of a fitted \code{rlassoIV} object. #' \code{confint.rlassoIV} extracts the confidence intervals. 
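Illustrative sketch (reviewer addition, not part of the patch): an end-to-end replay of the triple-selection branch above on simulated data. The data-generating process is invented purely for illustration; rlasso(), predict(), and tsls() are the same hdm functions the patch calls.

library(hdm)
set.seed(42)
n <- 200
p <- 20
x <- matrix(rnorm(n * p), n, p)
colnames(x) <- paste0("x", 1:p)
u <- rnorm(n)                                # structural error
z <- x[, 1] + rnorm(n)                       # instrument
d <- z + x[, 2] + 0.5 * u + rnorm(n)         # endogenous treatment
y <- 0.5 * d + x[, 2] + u                    # outcome
lasso.d.zx <- rlasso(cbind(z, x), d)         # step 1: d on (z, x)
lasso.y.x <- rlasso(x, y)                    # step 2: y on x
PZ <- as.matrix(predict(lasso.d.zx))         # optimal instrument
lasso.PZ.x <- rlasso(x, PZ)                  # step 3: PZ on x
Dr <- d - predict(lasso.PZ.x)                # treatment with controls partialled out
Yr <- lasso.y.x$residuals                    # outcome with controls partialled out
Zr <- lasso.PZ.x$residuals                   # instrument with controls partialled out
tsls(y = Yr, d = Dr, x = NULL, z = Zr, intercept = FALSE, homoscedastic = FALSE)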
@@ -170,15 +178,21 @@ rlassoIV.formula <- function(formula, data, select.Z = TRUE, select.X = TRUE, po #' @aliases methods.rlassoIV print.rlassoIV summary.rlassoIV #' @export -print.rlassoIV <- function(x, digits = max(3L, getOption("digits") - 3L), +print.rlassoIV <- function(x, digits = max(3L, getOption("digits") - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(coef(x))) { cat("Coefficients:\n") - print.default(format(coef(x), digits = digits), print.gap = 2L, - quote = FALSE) - } else cat("No coefficients\n") + print.default(format(coef(x), digits = digits), + print.gap = 2L, + quote = FALSE + ) + } else { + cat("No coefficients\n") + } cat("\n") invisible(coef(x)) } @@ -186,8 +200,8 @@ print.rlassoIV <- function(x, digits = max(3L, getOption("digits") - 3L), #' @rdname methods.rlassoIV #' @export -summary.rlassoIV <- function(object, digits = max(3L, getOption("digits") - - 3L), ...) { +summary.rlassoIV <- function(object, digits = max(3L, getOption("digits") - + 3L), ...) { if (length(coef(object))) { k <- length(object$coefficient) table <- matrix(NA, ncol = 4, nrow = k) @@ -195,10 +209,12 @@ summary.rlassoIV <- function(object, digits = max(3L, getOption("digits") - colnames(table) <- c("coeff.", "se.", "t-value", "p-value") table[, 1] <- object$coefficients table[, 2] <- object$se - table[, 3] <- table[, 1]/table[, 2] + table[, 3] <- table[, 1] / table[, 2] table[, 4] <- 2 * pnorm(-abs(table[, 3])) - cat("Estimates and Significance Testing of the effect of target variables in the IV regression model", - "\n") + cat( + "Estimates and Significance Testing of the effect of target variables in the IV regression model", + "\n" + ) printCoefmat(table, digits = digits, P.values = TRUE, has.Pvalue = TRUE) cat("\n") } else { @@ -216,10 +232,12 @@ confint.rlassoIV <- function(object, parm, level = 0.95, ...) { k <- length(object$coefficients) cf <- coef(object) pnames <- names(cf) - if (missing(parm)) - parm <- pnames else if (is.numeric(parm)) - parm <- pnames[parm] - a <- (1 - level)/2 + if (missing(parm)) { + parm <- pnames + } else if (is.numeric(parm)) { + parm <- pnames[parm] + } + a <- (1 - level) / 2 a <- c(a, 1 - a) # fac <- qt(a, n-k) fac <- qnorm(a) @@ -234,84 +252,88 @@ confint.rlassoIV <- function(object, parm, level = 0.95, ...) { #' Coefficients from S3 objects \code{rlassoIV} #' #' Method to extract coefficients from objects of class \code{rlassoIV}. -#' +#' #' Printing coefficients and selection matrix for S3 object \code{rlassoIV}. \code{"x"} indicates that a variable has been selected, i.e., the corresponding estimated coefficient is different from zero. -#' The very last column collects all variables that have been selected in at least one of the lasso regressions represented in the \code{selection.matrix}. +#' The very last column collects all variables that have been selected in at least one of the lasso regressions represented in the \code{selection.matrix}. #' \code{rlassoIV} performs three lasso regression steps. A first stage lasso regression of the endogenous treatment variable \code{d} on the instruments \code{z} and exogenous covariates \code{x}, -#' a lasso regression of \code{y} on the exogenous variables \code{x}, and a lasso regression of the instrumented treatment variable, i.e., a regression of the predicted values of \code{d}, on controls \code{x}. 
+#' a lasso regression of \code{y} on the exogenous variables \code{x}, and a lasso regression of the instrumented treatment variable, i.e., a regression of the predicted values of \code{d}, on controls \code{x}. #' #' @param object an object of class \code{rlassoIV}, usually a result of a call \code{rlassoIV} with options \code{select.X=TRUE} and \code{select.Z=TRUE}. #' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each first stage regression. #' Default is set to FALSE. See section on details for more information. #' @param complete general option of the function \code{coef}. #' @param ... further arguments passed to function coef. -#' @return Coefficients obtained from \code{rlassoIV} by default. If option \code{selection.matrix} is \code{TRUE}, a list is returned with final coefficients, a matrix \code{selection.matrix}, and a matrix \code{selection.matrixZ}: +#' @return Coefficients obtained from \code{rlassoIV} by default. If option \code{selection.matrix} is \code{TRUE}, a list is returned with final coefficients, a matrix \code{selection.matrix}, and a matrix \code{selection.matrixZ}: #' \code{selection.matrix} contains the selection index for the lasso regression of \code{y} on \code{x} (first column) and the lasso regression of the predicted values of \code{d} on \code{x} #' together with the union of these indizes. -#' \code{selection.matrixZ} contains the selection index from the first-stage lasso regression of \code{d} on \code{z} and \code{x}. +#' \code{selection.matrixZ} contains the selection index from the first-stage lasso regression of \code{d} on \code{z} and \code{x}. #' @export #' @rdname coef.rlassoIV -#' @examples +#' @examples #' \dontrun{ #' data(EminentDomain) #' z <- EminentDomain$logGDP$z # instruments #' x <- EminentDomain$logGDP$x # exogenous variables #' y <- EminentDomain$logGDP$y # outcome varialbe #' d <- EminentDomain$logGDP$d # treatment / endogenous variable -#' lasso.IV = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=TRUE) +#' lasso.IV <- rlassoIV(x = x, d = d, y = y, z = z, select.X = TRUE, select.Z = TRUE) #' coef(lasso.IV) # default behavior #' coef(lasso.IV, selection.matrix = T) # print selection matrix #' } -coef.rlassoIV <- function(object, complete = TRUE, selection.matrix = FALSE, ...) { - +coef.rlassoIV <- function(object, complete = TRUE, selection.matrix = FALSE, ...) { + cf <- object$coefficients - + if (selection.matrix == TRUE) { - - mat <- object$selection.matrix - dmat2 <- dim(mat)[2] - rnames <- rownames(mat) - mat <- cbind(mat, as.logical(apply(mat, 1, sum))) - colnames(mat)[dim(mat)[2]] <- "global" - mat <- rbind(mat, apply(mat, 2, sum, na.rm = TRUE)) - mat <- apply(mat, 2, function(x) gsub(1, "x", x)) - mat <- apply(mat, 2, function(x) gsub(0, ".", x)) - # mat[is.na(mat)] <- "-" - rownames(mat) <- c(rnames, "sum") - - # selection w.r.t. 
- matZ <- object$selection.matrixZ - dmatZ2 <- dim(matZ)[2] - Zrnames <- rownames(matZ) - matZ <- cbind(matZ, as.logical(apply(matZ, 1, sum))) - colnames(matZ)[dim(matZ)[2]] <- "global.Z" - matZ <- rbind(matZ, apply(matZ, 2, sum, na.rm = TRUE)) - matZ <- apply(matZ, 2, function(x) gsub(1, "x", x)) - matZ <- apply(matZ, 2, function(x) gsub(0, ".", x)) - # mat[is.na(matZ)] <- "-" - rownames(matZ) <- c(Zrnames, "sum") - - if (complete) { - coef <- list(cf = cf, selection.matrix = mat, - selection.matrixZ = matZ) - return(coef) - } - - else { - coef <- list(cf = cf[!is.na(cf)], selection.matrix = mat, - selection.matrixZ = matZ) - return(coef) - } + + mat <- object$selection.matrix + dmat2 <- dim(mat)[2] + rnames <- rownames(mat) + mat <- cbind(mat, as.logical(apply(mat, 1, sum))) + colnames(mat)[dim(mat)[2]] <- "global" + mat <- rbind(mat, apply(mat, 2, sum, na.rm = TRUE)) + mat <- apply(mat, 2, function(x) gsub(1, "x", x)) + mat <- apply(mat, 2, function(x) gsub(0, ".", x)) + # mat[is.na(mat)] <- "-" + rownames(mat) <- c(rnames, "sum") + + # selection w.r.t. + matZ <- object$selection.matrixZ + dmatZ2 <- dim(matZ)[2] + Zrnames <- rownames(matZ) + matZ <- cbind(matZ, as.logical(apply(matZ, 1, sum))) + colnames(matZ)[dim(matZ)[2]] <- "global.Z" + matZ <- rbind(matZ, apply(matZ, 2, sum, na.rm = TRUE)) + matZ <- apply(matZ, 2, function(x) gsub(1, "x", x)) + matZ <- apply(matZ, 2, function(x) gsub(0, ".", x)) + # mat[is.na(matZ)] <- "-" + rownames(matZ) <- c(Zrnames, "sum") + + if (complete) { + coef <- list( + cf = cf, selection.matrix = mat, + selection.matrixZ = matZ + ) + return(coef) + } + + else { + coef <- list( + cf = cf[!is.na(cf)], selection.matrix = mat, + selection.matrixZ = matZ + ) + return(coef) + } } - - else { + + else { if (complete) { return(cf) } - + else { return(cf[!is.na(cf)]) - } + } } } @@ -320,44 +342,48 @@ coef.rlassoIV <- function(object, complete = TRUE, selection.matrix = FALSE, .. #' @rdname rlassoIV #' @export -rlassoIVmult <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, +rlassoIVmult <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, ...) { # browser() d <- as.matrix(d) - if (is.null(colnames(d))) + if (is.null(colnames(d))) { colnames(d) <- paste("d", 1:ncol(d), sep = "") - if (is.null(colnames(x)) & !is.null(x)) + } + if (is.null(colnames(x)) & !is.null(x)) { colnames(x) <- paste("x", 1:ncol(x), sep = "") - if (is.null(colnames(z)) & !is.null(z)) + } + if (is.null(colnames(z)) & !is.null(z)) { colnames(z) <- paste("z", 1:ncol(z), sep = "") - + } + if (select.Z == FALSE & select.X == FALSE) { - res <- tsls(x=x, d=d, y=y, z=z, homoscedastic = FALSE, ...) + res <- tsls(x = x, d = d, y = y, z = z, homoscedastic = FALSE, ...) return(res) } - + if (select.Z == TRUE & select.X == FALSE) { res <- rlassoIVselectZ(x, d, y, z, ...) return(res) } - + if (select.Z == FALSE & select.X == TRUE) { res <- rlassoIVselectX(x, d, y, z, ...) return(res) } - + if (select.Z == TRUE & select.X == TRUE) { d <- as.matrix(d) n <- dim(x)[1] d <- as.matrix(d) kd <- dim(d)[2] Z <- cbind(z, x) - if (is.null(colnames(d))) + if (is.null(colnames(d))) { colnames(d) <- paste("d", 1:kd, sep = "") - + } + lasso.y.x <- rlasso(x, y, ...) 
Yr <- lasso.y.x$residuals - + Drhat <- NULL Zrhat <- NULL selection.matrix <- matrix(NA, ncol = (1 + dim(d)[2]), nrow = dim(x)[2]) @@ -366,9 +392,9 @@ rlassoIVmult <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, selection.matrixZ <- matrix(NA, ncol = dim(d)[2], nrow = dim(Z)[2]) rownames(selection.matrixZ) <- colnames(Z) colnames(selection.matrixZ) <- colnames(d) - - selection.matrix[,1] <- lasso.y.x$index - + + selection.matrix[, 1] <- lasso.y.x$index + for (i in 1:kd) { lasso.d.x <- rlasso(d[, i] ~ x, ...) lasso.d.zx <- rlasso(d[, i] ~ Z, ...) @@ -376,17 +402,22 @@ rlassoIVmult <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, Drhat <- cbind(Drhat, d[, i] - mean(d[, i])) Zrhat <- cbind(Zrhat, d[, i] - mean(d[, i])) colnames(Drhat)[i] <- colnames(d)[i] - selection.matrix[,i+1] <- selection.matrixZ[, i] <- FALSE + selection.matrix[, i + 1] <- selection.matrixZ[, i] <- FALSE next } selection.matrixZ[, i] <- ind.dzx <- lasso.d.zx$index - PZ <- Z[, ind.dzx, drop = FALSE] %*% MASS::ginv(t(Z[, ind.dzx, - drop = FALSE]) %*% Z[, ind.dzx, drop = FALSE]) %*% t(Z[, - ind.dzx, drop = FALSE]) %*% d[, i, drop = FALSE] + PZ <- Z[, ind.dzx, drop = FALSE] %*% MASS::ginv(t(Z[, ind.dzx, + drop = FALSE + ]) %*% Z[, ind.dzx, drop = FALSE]) %*% t(Z[, + ind.dzx, + drop = FALSE + ]) %*% d[, i, drop = FALSE] lasso.PZ.x <- rlasso(PZ ~ x, ...) - selection.matrix[, i+1] <- ind.PZx <- lasso.PZ.x$index - Dr <- d[, i] - x[, ind.PZx, drop = FALSE] %*% MASS::ginv(t(x[, - ind.PZx, drop = FALSE]) %*% x[, ind.PZx, drop = FALSE]) %*% + selection.matrix[, i + 1] <- ind.PZx <- lasso.PZ.x$index + Dr <- d[, i] - x[, ind.PZx, drop = FALSE] %*% MASS::ginv(t(x[, + ind.PZx, + drop = FALSE + ]) %*% x[, ind.PZx, drop = FALSE]) %*% t(x[, ind.PZx, drop = FALSE]) %*% PZ Zr <- lasso.PZ.x$residuals Drhat <- cbind(Drhat, Dr) @@ -396,9 +427,11 @@ rlassoIVmult <- function(x, d, y, z, select.Z = TRUE, select.X = TRUE, coef <- as.vector(result$coefficient) se <- sqrt(diag(result$vcov)) names(coef) <- names(se) <- colnames(d) - res <- list(coefficients = coef, se = se, vcov = result$vcov, call = match.call(), - samplesize = n, selection.matrixZ = selection.matrixZ, selection.matrix = selection.matrix) + res <- list( + coefficients = coef, se = se, vcov = result$vcov, call = match.call(), + samplesize = n, selection.matrixZ = selection.matrixZ, selection.matrix = selection.matrix + ) class(res) <- "rlassoIV" return(res) } -} \ No newline at end of file +} diff --git a/R/rlassoIVselectX.R b/R/rlassoIVselectX.R index 12db6b4..fe359cc 100644 --- a/R/rlassoIVselectX.R +++ b/R/rlassoIVselectX.R @@ -1,4 +1,4 @@ -#' Instrumental Variable Estimation with Selection on the exogenous Variables by Lasso +#' Instrumental Variable Estimation with Selection on the exogenous Variables by Lasso #' #' #' This function estimates the coefficient of an endogenous variable by employing Instrument Variables in a setting where the exogenous variables are high-dimensional and hence @@ -7,7 +7,7 @@ #' #' The implementation is a special case of of Chernozhukov et al. (2015). #' The option \code{post=TRUE} conducts post-lasso estimation for the Lasso estimations, i.e. a refit of the -#' model with the selected variables. Exogenous variables +#' model with the selected variables. Exogenous variables #' \code{x} are automatically used as instruments and added to the #' instrument set \code{z}. 
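Illustrative sketch (reviewer addition, not part of the patch): selection on X partials the high-dimensional controls out of y, d, and every instrument by lasso and then runs TSLS on the residuals, as rlassoIVselectX.default below implements. A minimal call on simulated data; the data-generating process is invented for illustration.

library(hdm)
set.seed(1)
n <- 200
p <- 50
x <- matrix(rnorm(n * p), n, p)
colnames(x) <- paste0("x", 1:p)
u <- rnorm(n)
z <- x[, 1] + rnorm(n)                # single instrument
d <- z + x[, 2] + 0.5 * u + rnorm(n)  # endogenous treatment
y <- d + x[, 2] + u                   # outcome
fit <- rlassoIVselectX(x = x, d = d, y = y, z = z, post = TRUE)
summary(fit)
confint(fit)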
#' @@ -28,56 +28,63 @@ #' @rdname rlassoIVselectX #' @examples #' library(hdm) -#' data(AJR); y = AJR$GDP; d = AJR$Exprop; z = AJR$logMort -#' x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + -#' Asia + Namer + Samer)^2, data=AJR) +#' data(AJR) +#' y <- AJR$GDP +#' d <- AJR$Exprop +#' z <- AJR$logMort +#' x <- model.matrix(~ -1 + (Latitude + Latitude2 + Africa + +#' Asia + Namer + Samer)^2, data = AJR) #' dim(x) -#' #AJR.Xselect = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=FALSE) -#' AJR.Xselect = rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | -#' logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, -#' data=AJR, select.X=TRUE, select.Z=FALSE) +#' # AJR.Xselect = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=FALSE) +#' AJR.Xselect <- rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | +#' logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, +#' data = AJR, select.X = TRUE, select.Z = FALSE +#' ) #' summary(AJR.Xselect) #' confint(AJR.Xselect) -rlassoIVselectX <- function(x, ...) - UseMethod("rlassoIVselectX") # definition generic function +rlassoIVselectX <- function(x, ...) { + UseMethod("rlassoIVselectX") +} # definition generic function #' @export #' @rdname rlassoIVselectX -rlassoIVselectX.default <- function(x,d,y,z, post=TRUE, ...) { +rlassoIVselectX.default <- function(x, d, y, z, post = TRUE, ...) { d <- as.matrix(d) z <- as.matrix(z) - if (is.null(colnames(d))) colnames(d) <- paste("d", 1:ncol(d), sep="") - if (is.null(colnames(x)) & !is.null(x)) colnames(x) <- paste("x", 1:ncol(x), sep="") - if (is.null(colnames(z)) & !is.null(z)) colnames(z) <- paste("z", 1:ncol(z), sep="") + if (is.null(colnames(d))) colnames(d) <- paste("d", 1:ncol(d), sep = "") + if (is.null(colnames(x)) & !is.null(x)) colnames(x) <- paste("x", 1:ncol(x), sep = "") + if (is.null(colnames(z)) & !is.null(z)) colnames(z) <- paste("z", 1:ncol(z), sep = "") n <- length(y) numIV <- dim(z)[2] - Z <- cbind(z,x) - lasso.d.x <- rlasso(d ~ x, post=post, ...) + Z <- cbind(z, x) + lasso.d.x <- rlasso(d ~ x, post = post, ...) Dr <- d - predict(lasso.d.x) - lasso.y.x <- rlasso(y ~ x, post=post, ...) + lasso.y.x <- rlasso(y ~ x, post = post, ...) Yr <- y - predict(lasso.y.x) - Zr <- matrix(NA, nrow=n, ncol=numIV) - - k <- 1 + dim(d)[2] + numIV + Zr <- matrix(NA, nrow = n, ncol = numIV) + + k <- 1 + dim(d)[2] + numIV selection.matrix <- matrix(NA, ncol = k, nrow = dim(x)[2]) - colnames(selection.matrix) <- c("y", colnames(d), colnames(z)) + colnames(selection.matrix) <- c("y", colnames(d), colnames(z)) rownames(selection.matrix) <- colnames(x) - selection.matrix[,1] <- lasso.y.x$index - selection.matrix[,2] <- lasso.d.x$index - - for (i in seq(length.out=numIV)) { - lasso.z.x <- rlasso(z[,i] ~ x, post=post, ...) - Zr[,i] <- z[,i] - predict(lasso.z.x) - selection.matrix[, i + 2] <- lasso.z.x$index + selection.matrix[, 1] <- lasso.y.x$index + selection.matrix[, 2] <- lasso.d.x$index + + for (i in seq(length.out = numIV)) { + lasso.z.x <- rlasso(z[, i] ~ x, post = post, ...) 
+ Zr[, i] <- z[, i] - predict(lasso.z.x) + selection.matrix[, i + 2] <- lasso.z.x$index } - result <- tsls(y = Yr,d = Dr,x=NULL, z = Zr, intercept=FALSE) + result <- tsls(y = Yr, d = Dr, x = NULL, z = Zr, intercept = FALSE) coef <- as.vector(result$coefficient) se <- diag(sqrt(result$vcov)) vcov <- result$vcov - + names(coef) <- names(se) <- colnames(d) - res <- list(coefficients=coef, se=se, vcov=vcov, call=match.call(), samplesize=n, - selection.matrix = selection.matrix) + res <- list( + coefficients = coef, se = se, vcov = vcov, call = match.call(), samplesize = n, + selection.matrix = selection.matrix + ) class(res) <- "rlassoIVselectX" return(res) } @@ -87,15 +94,15 @@ rlassoIVselectX.default <- function(x,d,y,z, post=TRUE, ...) { #' @export #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, #' d endogenous variable, z instrumental variables, and x exogenous variables. -#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoIVselectX} is called. -rlassoIVselectX.formula <- function(formula, data, post=TRUE, ...) { +rlassoIVselectX.formula <- function(formula, data, post = TRUE, ...) { mat <- f.formula(formula, data, all.categories = FALSE) y <- mat$Y x <- mat$X d <- mat$D z <- mat$Z - res <- rlassoIVselectX(x=x, d=d, y=y, z=z, post=post, ...) + res <- rlassoIVselectX(x = x, d = d, y = y, z = z, post = post, ...) res$call <- match.call() return(res) } @@ -105,7 +112,7 @@ rlassoIVselectX.formula <- function(formula, data, post=TRUE, ...) { #' Methods for S3 object \code{rlassoIVselectX} #' -#' Objects of class \code{rlassoIVselectX} are constructed by \code{rlassoIVselectX}. +#' Objects of class \code{rlassoIVselectX} are constructed by \code{rlassoIVselectX}. #' \code{print.rlassoIVselectX} prints and displays some information about fitted \code{rlassoIVselectX} objects. #' \code{summary.rlassoIVselectX} summarizes information of a fitted \code{rlassoIVselectX} object. #' \code{confint.rlassoIVselectX} extracts the confidence intervals. @@ -123,10 +130,14 @@ print.rlassoIVselectX <- function(x, digits = max(3L, getOption("digits") - 3L), cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), "\n\n", sep = "") if (length(coef(x))) { cat("Coefficients:\n") - print.default(format(coef(x), digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(coef(x), digits = digits), + print.gap = 2L, + quote = FALSE + ) + } + else { + cat("No coefficients\n") } - else cat("No coefficients\n") cat("\n") invisible(coef(x)) } @@ -137,15 +148,15 @@ print.rlassoIVselectX <- function(x, digits = max(3L, getOption("digits") - 3L), summary.rlassoIVselectX <- function(object, digits = max(3L, getOption("digits") - 3L), ...) 
{ if (length(coef(object))) { k <- length(object$coefficient) - table <- matrix(NA,ncol=4,nrow=k) + table <- matrix(NA, ncol = 4, nrow = k) rownames(table) <- names(object$coefficient) colnames(table) <- c("coeff.", "se.", "t-value", "p-value") - table[,1] <- object$coefficient - table[,2] <- sqrt(diag(as.matrix(object$vcov))) - table[,3] <- table[,1]/table[,2] - table[,4] <- 2*pnorm(-abs(table[,3])) + table[, 1] <- object$coefficient + table[, 2] <- sqrt(diag(as.matrix(object$vcov))) + table[, 3] <- table[, 1] / table[, 2] + table[, 4] <- 2 * pnorm(-abs(table[, 3])) print("Estimation and significance testing of the effect of target variables in the IV regression model") - printCoefmat(table, digits=digits, P.values=TRUE, has.Pvalue=TRUE) + printCoefmat(table, digits = digits, P.values = TRUE, has.Pvalue = TRUE) cat("\n") } else { cat("No coefficients\n") @@ -157,23 +168,26 @@ summary.rlassoIVselectX <- function(object, digits = max(3L, getOption("digits") #' @rdname methods.rlassoIVselectX #' @export -confint.rlassoIVselectX <- function(object, parm, level=0.95, ...) { +confint.rlassoIVselectX <- function(object, parm, level = 0.95, ...) { n <- object$samplesize k <- length(object$coefficients) cf <- coef(object) pnames <- names(cf) - if (missing(parm)) + if (missing(parm)) { parm <- pnames - else if (is.numeric(parm)) + } else if (is.numeric(parm)) { parm <- pnames[parm] - a <- (1 - level)/2 + } + a <- (1 - level) / 2 a <- c(a, 1 - a) - #fac <- qt(a, n-k) + # fac <- qt(a, n-k) fac <- qnorm(a) pct <- format.perc(a, 3) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) - #ses <- sqrt(diag(object$vcov))[parm] + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list( + parm, + pct + )) + # ses <- sqrt(diag(object$vcov))[parm] ses <- object$se[parm] ci[] <- cf[parm] + ses %o% fac print(ci) @@ -183,41 +197,45 @@ confint.rlassoIVselectX <- function(object, parm, level=0.95, ...) { #' Coefficients from S3 objects \code{rlassoIVselectX} #' #' Method to extract coefficients and selection matrix from objects of class \code{rlassoIVselectX}. -#' +#' #' Printing coefficients and selection matrix for S3 object \code{rlassoIVselectX}. The first column of the selection matrix reports the selection index for the lasso regression of \code{y} on \code{x} in the specified #' \code{rlassoIVselectX} command. \code{"x"} indicates that a variable has been selected, i.e., the corresponding estimated coefficient is different from zero. #' The second column contains the selection index for the lasso regression of \code{d} on \code{x} and the remaining columns -#' the index of selected variables \code{x} for the instruments \code{z}. The very last column collects all variables that have been selected in at least one of the lasso regressions. -#' -#' @param object an object of class \code{rlassoIVselectX}, usually a result of a call +#' the index of selected variables \code{x} for the instruments \code{z}. The very last column collects all variables that have been selected in at least one of the lasso regressions. +#' +#' @param object an object of class \code{rlassoIVselectX}, usually a result of a call #' \code{rlassoIVselectX} or \code{rlassoIV} with options \code{select.X=TRUE} and #' \code{select.Z=FALSE}. #' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each regression. -#' Default is set to FALSE. See section on details for more information. +#' Default is set to FALSE. See section on details for more information. 
#' @param complete general option of the function \code{coef}. -#' @param ... further arguments passed to functions coef. +#' @param ... further arguments passed to functions coef. #' @export #' @rdname coef.rlassoIVselectX #' @examples #' \dontrun{ #' library(hdm) -#' data(AJR); y = AJR$GDP; d = AJR$Exprop; z = AJR$logMort -#' x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + -#' Asia + Namer + Samer)^2, data=AJR) -#' AJR.Xselect = rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | -#' logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, -#' data=AJR, select.X=TRUE, select.Z=FALSE) +#' data(AJR) +#' y <- AJR$GDP +#' d <- AJR$Exprop +#' z <- AJR$logMort +#' x <- model.matrix(~ -1 + (Latitude + Latitude2 + Africa + +#' Asia + Namer + Samer)^2, data = AJR) +#' AJR.Xselect <- rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | +#' logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, +#' data = AJR, select.X = TRUE, select.Z = FALSE +#' ) #' coef(AJR.Xselect) # Default behavior #' coef(AJR.Xselect, selection.matrix = TRUE) # print selection matrix #' } -coef.rlassoIVselectX <- function(object, complete = TRUE, selection.matrix = FALSE, ...){ - +coef.rlassoIVselectX <- function(object, complete = TRUE, selection.matrix = FALSE, ...) { + cf <- object$coefficients - + if (selection.matrix == TRUE) { - + mat <- object$selection.matrix - + dmat2 <- dim(mat)[2] rnames <- rownames(mat) mat <- cbind(mat, as.logical(apply(mat, 1, sum))) @@ -227,24 +245,24 @@ coef.rlassoIVselectX <- function(object, complete = TRUE, selection.matrix = FA mat <- apply(mat, 2, function(x) gsub(0, ".", x)) # mat[is.na(mat)] <- "-" rownames(mat) <- c(rnames, "sum") - + if (complete) { coef <- list(cf = cf, selection.matrix = mat) return(coef) } - + else { coef <- list(cf = cf[!is.na(cf)], selection.matrix = mat) return(coef) - } + } } else { if (complete) { return(cf) } - + else { return(cf[!is.na(cf)]) - } + } } } diff --git a/R/rlassoIVselectZ.R b/R/rlassoIVselectZ.R index c681755..b8e03dd 100644 --- a/R/rlassoIVselectZ.R +++ b/R/rlassoIVselectZ.R @@ -26,40 +26,43 @@ #' Sparse models and methods for optimal instruments with an application to #' eminent domain. \emph{Econometrica} 80 (6), 2369--2429. #' @export -rlassoIVselectZ <- function(x, ...) - UseMethod("rlassoIVselectZ") # definition generic function +rlassoIVselectZ <- function(x, ...) { + UseMethod("rlassoIVselectZ") +} # definition generic function #' @export #' @rdname rlassoIVselectZ rlassoIVselectZ.default <- function(x, d, y, z, post = TRUE, intercept = TRUE, ...) { - + d <- as.matrix(d) if (is.vector(x)) x <- as.matrix(x) n <- length(y) kex <- dim(x)[2] ke <- dim(d)[2] - - if (is.null(colnames(d))) + + if (is.null(colnames(d))) { colnames(d) <- paste("d", 1:ke, sep = "") - if (is.null(colnames(x)) & !is.null(x)) + } + if (is.null(colnames(x)) & !is.null(x)) { colnames(x) <- paste("x", 1:kex, sep = "") - - Z <- cbind(z,x) # including the x-variables as instruments + } + + Z <- cbind(z, x) # including the x-variables as instruments kiv <- dim(Z)[2] select.mat <- NULL # matrix with the selected variables - + # first stage regression Dhat <- NULL flag.const <- 0 for (i in 1:ke) { di <- d[, i] - #lasso.fit <- rlasso(di ~ Z, post = post, intercept = intercept, ...) - lasso.fit <- rlasso(y=di, x=Z, post = post, intercept = intercept, ...) + # lasso.fit <- rlasso(di ~ Z, post = post, intercept = intercept, ...) 
+ lasso.fit <- rlasso(y = di, x = Z, post = post, intercept = intercept, ...) if (sum(lasso.fit$ind) == 0) { - dihat <- rep(mean(di), n) #dihat <- mean(di) + dihat <- rep(mean(di), n) # dihat <- mean(di) flag.const <- flag.const + 1 - if (flag.const >1) message("No variables selected for two or more instruments leading to multicollinearity problems.") - #intercept <- FALSE # to avoid multicollineariry + if (flag.const > 1) message("No variables selected for two or more instruments leading to multicollinearity problems.") + # intercept <- FALSE # to avoid multicollineariry select.mat <- cbind(select.mat, FALSE) } else { # dihat <- z%*%lasso.fit$coefficients @@ -69,43 +72,47 @@ rlassoIVselectZ.default <- function(x, d, y, z, post = TRUE, intercept = TRUE, . Dhat <- cbind(Dhat, dihat) } colnames(select.mat) <- colnames(d) - #if (intercept) { #? - # Dhat <- cbind(Dhat, 1, x) + # if (intercept) { #? + # Dhat <- cbind(Dhat, 1, x) # d <- cbind(d, 1, x) - #} else { + # } else { # Dhat <- cbind(Dhat, x) # d <- cbind(d, x) - #} - + # } + Dhat <- cbind(Dhat, x) d <- cbind(d, x) - + # calculation coefficients - #alpha.hat <- solve(t(Dhat) %*% d) %*% (t(Dhat) %*% y) - alpha.hat <- MASS::ginv(t(Dhat)%*%d)%*%(t(Dhat)%*%y) + # alpha.hat <- solve(t(Dhat) %*% d) %*% (t(Dhat) %*% y) + alpha.hat <- MASS::ginv(t(Dhat) %*% d) %*% (t(Dhat) %*% y) # calcualtion of the variance-covariance matrix residuals <- y - d %*% alpha.hat - #Omega.hat <- t(Dhat) %*% diag(as.vector(residuals^2)) %*% Dhat # Dhat.e <- Dhat*as.vector(residuals); Omega.hat <- t(Dhat.e)%*%Dhat.e - Omega.hat <- t(Dhat) %*% (Dhat*as.vector(residuals^2)) - Q.hat.inv <- MASS::ginv(t(d) %*% Dhat) #solve(t(d)%*%Dhat) + # Omega.hat <- t(Dhat) %*% diag(as.vector(residuals^2)) %*% Dhat # Dhat.e <- Dhat*as.vector(residuals); Omega.hat <- t(Dhat.e)%*%Dhat.e + Omega.hat <- t(Dhat) %*% (Dhat * as.vector(residuals^2)) + Q.hat.inv <- MASS::ginv(t(d) %*% Dhat) # solve(t(d)%*%Dhat) vcov <- Q.hat.inv %*% Omega.hat %*% t(Q.hat.inv) rownames(alpha.hat) <- c(colnames(d)) colnames(vcov) <- rownames(vcov) <- rownames(alpha.hat) - - if (is.null(x)){ - res <- list(coefficients = alpha.hat[1:ke, ], se = sqrt(diag(vcov))[1:ke], - vcov = vcov[1:ke, 1:ke, drop = FALSE], residuals = residuals, samplesize = n, selected = select.mat, - selection.matrix = select.mat, - call = match.call()) + + if (is.null(x)) { + res <- list( + coefficients = alpha.hat[1:ke, ], se = sqrt(diag(vcov))[1:ke], + vcov = vcov[1:ke, 1:ke, drop = FALSE], residuals = residuals, samplesize = n, selected = select.mat, + selection.matrix = select.mat, + call = match.call() + ) } - else{ - if (is.null(x) == FALSE){ - res <- list(coefficients = alpha.hat[1:ke, ], se = sqrt(diag(vcov))[1:ke], - vcov = vcov[1:ke, 1:ke, drop = FALSE], - coefficients.controls = alpha.hat[(ke + 1):(ke + kex), ], se.controls = sqrt(diag(vcov))[(ke + 1):(ke + kex)], - vcov.controls = vcov[(ke + 1):(ke + kex), (ke + 1):(ke + kex), drop = FALSE], - residuals = residuals, samplesize = n, selected = select.mat, selection.matrix = select.mat, - call = match.call()) + else { + if (is.null(x) == FALSE) { + res <- list( + coefficients = alpha.hat[1:ke, ], se = sqrt(diag(vcov))[1:ke], + vcov = vcov[1:ke, 1:ke, drop = FALSE], + coefficients.controls = alpha.hat[(ke + 1):(ke + kex), ], se.controls = sqrt(diag(vcov))[(ke + 1):(ke + kex)], + vcov.controls = vcov[(ke + 1):(ke + kex), (ke + 1):(ke + kex), drop = FALSE], + residuals = residuals, samplesize = n, selected = select.mat, selection.matrix = select.mat, + call = match.call() + ) } } 
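Illustrative aside (reviewer addition, not part of the patch): the estimator assembled in rlassoIVselectZ.default uses the first-stage fitted values Dhat as instruments, alpha.hat = (Dhat'd)^(-1) Dhat'y, with the heteroscedasticity-robust sandwich V = Q^(-1) Omega Q^(-1)', where Q = d'Dhat and Omega = Dhat'(Dhat * e^2). The snippet below re-runs that algebra on made-up data, reusing the object names from the patch.

set.seed(1)
n <- 100
Dhat <- cbind(1, rnorm(n))               # instruments (including a constant)
d <- cbind(1, Dhat[, 2] + rnorm(n))      # regressors
y <- d %*% c(1, 2) + rnorm(n)
alpha.hat <- MASS::ginv(t(Dhat) %*% d) %*% (t(Dhat) %*% y)
e <- as.vector(y - d %*% alpha.hat)
Omega.hat <- t(Dhat) %*% (Dhat * e^2)    # Dhat' diag(e^2) Dhat
Q.hat.inv <- MASS::ginv(t(d) %*% Dhat)
vcov.hat <- Q.hat.inv %*% Omega.hat %*% t(Q.hat.inv)
cbind(estimate = as.vector(alpha.hat), se = sqrt(diag(vcov.hat)))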
class(res) <- "rlassoIVselectZ" @@ -117,16 +124,16 @@ rlassoIVselectZ.default <- function(x, d, y, z, post = TRUE, intercept = TRUE, . #' @export #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, #' d endogenous variable, z instrumental variables, and x exogenous variables. -#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoIVselectZ} is called. -rlassoIVselectZ.formula <- function(formula, data, post=TRUE, intercept = TRUE, ...) { +rlassoIVselectZ.formula <- function(formula, data, post = TRUE, intercept = TRUE, ...) { mat <- f.formula(formula, data, all.categories = FALSE) y <- mat$Y x <- mat$X d <- mat$D z <- mat$Z - - res <- rlassoIVselectZ(x=x,d=d,y=y,z=z, post=post, intercept=intercept, ...) + + res <- rlassoIVselectZ(x = x, d = d, y = y, z = z, post = post, intercept = intercept, ...) res$call <- match.call() return(res) } @@ -134,7 +141,7 @@ rlassoIVselectZ.formula <- function(formula, data, post=TRUE, intercept = TRUE, #' Methods for S3 object \code{rlassoIVselectZ} #' -#' Objects of class \code{rlassoIVselectZ} are constructed by \code{rlassoIVselectZ}. +#' Objects of class \code{rlassoIVselectZ} are constructed by \code{rlassoIVselectZ}. #' \code{print.rlassoIVselectZ} prints and displays some information about fitted \code{rlassoIVselectZ} objects. #' \code{summary.rlassoIVselectZ} summarizes information of a fitted \code{rlassoIVselectZ} object. #' \code{confint.rlassoIVselectZ} extracts the confidence intervals. @@ -148,15 +155,21 @@ rlassoIVselectZ.formula <- function(formula, data, post=TRUE, intercept = TRUE, #' @aliases methods.rlassoIVselectZ print.rlassoIVselectZ summary.rlassoIVselectZ #' @export -print.rlassoIVselectZ <- function(x, digits = max(3L, getOption("digits") - - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") +print.rlassoIVselectZ <- function(x, digits = max(3L, getOption("digits") - + 3L), ...) { + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(coef(x))) { cat("Coefficients:\n") - print.default(format(coef(x), digits = digits), print.gap = 2L, - quote = FALSE) - } else cat("No coefficients\n") + print.default(format(coef(x), digits = digits), + print.gap = 2L, + quote = FALSE + ) + } else { + cat("No coefficients\n") + } cat("\n") invisible(coef(x)) } @@ -164,8 +177,8 @@ print.rlassoIVselectZ <- function(x, digits = max(3L, getOption("digits") - #' @rdname methods.rlassoIVselectZ #' @export -summary.rlassoIVselectZ <- function(object, digits = max(3L, getOption("digits") - - 3L), ...) { +summary.rlassoIVselectZ <- function(object, digits = max(3L, getOption("digits") - + 3L), ...) 
{ if (length(coef(object))) { k <- length(object$coefficients) table <- matrix(NA, ncol = 4, nrow = k) @@ -173,7 +186,7 @@ summary.rlassoIVselectZ <- function(object, digits = max(3L, getOption("digits") colnames(table) <- c("coeff.", "se.", "t-value", "p-value") table[, 1] <- object$coefficients table[, 2] <- sqrt(diag(as.matrix(object$vcov))) - table[, 3] <- table[, 1]/table[, 2] + table[, 3] <- table[, 1] / table[, 2] table[, 4] <- 2 * pnorm(-abs(table[, 3])) print("Estimates and significance testing of the effect of target variables in the IV regression model") printCoefmat(table, digits = digits, P.values = TRUE, has.Pvalue = TRUE) @@ -193,10 +206,12 @@ confint.rlassoIVselectZ <- function(object, parm, level = 0.95, ...) { k <- length(object$coefficients) cf <- coef(object) pnames <- names(cf) - if (missing(parm)) - parm <- pnames else if (is.numeric(parm)) - parm <- pnames[parm] - a <- (1 - level)/2 + if (missing(parm)) { + parm <- pnames + } else if (is.numeric(parm)) { + parm <- pnames[parm] + } + a <- (1 - level) / 2 a <- c(a, 1 - a) # fac <- qt(a, n-k) fac <- qnorm(a) @@ -212,40 +227,40 @@ confint.rlassoIVselectZ <- function(object, parm, level = 0.95, ...) { #' Coefficients from S3 objects \code{rlassoIVselectZ} #' #' Method to extract coefficients from objects of class \code{rlassoIVselectZ}. -#' +#' #' Printing coefficients and selection matrix for S3 object \code{rlassoIVselectZ}. The columns of the selection matrix report the selection index for the first stage lasso regressions as specified #' \code{rlassoIVselectZ} command, i.e., the selected variables for each of the endogenous variables. \code{"x"} indicates that a variable has been selected, i.e., the corresponding estimated coefficient is different from zero. -#' The very last column collects all variables that have been selected in at least one of the lasso regressions. -#' -#' @param object an object of class \code{rlassoIVselectZ}, usually a result of a call +#' The very last column collects all variables that have been selected in at least one of the lasso regressions. +#' +#' @param object an object of class \code{rlassoIVselectZ}, usually a result of a call #' \code{rlassoIVselectZ} or \code{rlassoIV} with options \code{select.X=FALSE} and #' \code{select.Z=TRUE}. #' @param selection.matrix if TRUE, a selection matrix is returned that indicates the selected variables from each first stage regression. -#' Default is set to FALSE. See section on details for more information. +#' Default is set to FALSE. See section on details for more information. #' @param complete general option of the function \code{coef}. -#' @param ... further arguments passed to functions coef. +#' @param ... further arguments passed to functions coef. #' @export #' @rdname coef.rlassoIVselectZ #' @examples #' \dontrun{ -#' lasso.IV.Z = rlassoIVselectZ(x=x, d=d, y=y, z=z) +#' lasso.IV.Z <- rlassoIVselectZ(x = x, d = d, y = y, z = z) #' data(EminentDomain) #' z <- EminentDomain$logGDP$z # instruments #' x <- EminentDomain$logGDP$x # exogenous variables #' y <- EminentDomain$logGDP$y # outcome varialbe #' d <- EminentDomain$logGDP$d # treatment / endogenous variable -#' lasso.IV.Z = rlassoIVselectZ(x=x, d=d, y=y, z=z) +#' lasso.IV.Z <- rlassoIVselectZ(x = x, d = d, y = y, z = z) #' coef(lasso.IV.Z) # Default behavior #' coef(lasso.IV.Z, selection.matrix = T) #' } -coef.rlassoIVselectZ <- function(object, complete = TRUE, selection.matrix = FALSE, ...){ - +coef.rlassoIVselectZ <- function(object, complete = TRUE, selection.matrix = FALSE, ...) 
{ + cf <- object$coefficients - + if (selection.matrix == TRUE) { - + mat <- object$selection.matrix - + dmat2 <- dim(mat)[2] rnames <- rownames(mat) mat <- cbind(mat, as.logical(apply(mat, 1, sum))) @@ -255,26 +270,24 @@ coef.rlassoIVselectZ <- function(object, complete = TRUE, selection.matrix = FA mat <- apply(mat, 2, function(x) gsub(0, ".", x)) # mat[is.na(mat)] <- "-" rownames(mat) <- c(rnames, "sum") - + if (complete) { coef <- list(cf = cf, selection.matrix = mat) return(coef) } - + else { coef <- list(cf = cf[!is.na(cf)], selection.matrix = mat) return(coef) - } + } } else { if (complete) { return(cf) } - + else { return(cf[!is.na(cf)]) - } + } } } - - diff --git a/R/rlassologit.R b/R/rlassologit.R index 53b7c3e..44f391c 100644 --- a/R/rlassologit.R +++ b/R/rlassologit.R @@ -35,30 +35,30 @@ #' @export #' @rdname rlassologit #' @examples -#'\dontrun{ +#' \dontrun{ #' library(hdm) #' ## DGP #' set.seed(2) #' n <- 250 #' p <- 100 #' px <- 10 -#' X <- matrix(rnorm(n*p), ncol=p) -#' beta <- c(rep(2,px), rep(0,p-px)) +#' X <- matrix(rnorm(n * p), ncol = p) +#' beta <- c(rep(2, px), rep(0, p - px)) #' intercept <- 1 -#' P <- exp(intercept + X %*% beta)/(1+exp(intercept + X %*% beta)) -#' y <- rbinom(length(y), size=1, prob=P) +#' P <- exp(intercept + X %*% beta) / (1 + exp(intercept + X %*% beta)) +#' y <- rbinom(length(y), size = 1, prob = P) #' ## fit rlassologit object -#' rlassologit.reg <- rlassologit(y~X) +#' rlassologit.reg <- rlassologit(y ~ X) #' ## methods -#' summary(rlassologit.reg, all=F) +#' summary(rlassologit.reg, all = F) #' print(rlassologit.reg) -#' predict(rlassologit.reg, type='response') -#' X3 <- matrix(rnorm(n*p), ncol=p) -#' predict(rlassologit.reg, newdata=X3) +#' predict(rlassologit.reg, type = "response") +#' X3 <- matrix(rnorm(n * p), ncol = p) +#' predict(rlassologit.reg, newdata = X3) #' } rlassologit <- function(x, ...) { UseMethod("rlassologit") - }# definition generic function +} # definition generic function #' @param formula an object of class 'formula' (or one that can be coerced to #' that class): a symbolic description of the model to be fitted in the form @@ -66,8 +66,10 @@ rlassologit <- function(x, ...) { #' @param data an optional data frame, list or environment. #' @export #' @rdname rlassologit -rlassologit.formula <- function(formula, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, penalty = list(lambda = NULL, - c = 1.1, gamma = 0.1/log(n)), control = list(threshold = NULL), ...) { +rlassologit.formula <- function(formula, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, penalty = list( + lambda = NULL, + c = 1.1, gamma = 0.1 / log(n) + ), control = list(threshold = NULL), ...) { cl <- match.call() mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data"), names(mf), 0L) @@ -79,99 +81,114 @@ rlassologit.formula <- function(formula, data = NULL, post = TRUE, intercept = T attr(mt, "intercept") <- 1 y <- model.response(mf, "numeric") n <- length(y) - x <- model.matrix(mt, mf)[,-1, drop=FALSE] + x <- model.matrix(mt, mf)[, -1, drop = FALSE] if (missing(data)) { if (is.call(formula[[3]])) { colnames(x) <- gsub(re.escape(format(formula[[3]])), "", colnames(x)) } else { - colnames(x) <- gsub(re.escape(formula[[3]]), "", colnames(x)) + colnames(x) <- gsub(re.escape(formula[[3]]), "", colnames(x)) } } - est <- rlassologit(x, y, post = post, intercept = intercept, model = model, penalty = penalty, - control = control, ...) 
+ est <- rlassologit(x, y, + post = post, intercept = intercept, model = model, penalty = penalty, + control = control, ... + ) est$call <- cl return(est) } #' @export #' @rdname rlassologit -rlassologit.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, penalty = list(lambda = NULL, - c = 1.1, gamma = 0.1/log(n)), control = list(threshold = NULL), ...) { +rlassologit.character <- function(x, data = NULL, post = TRUE, intercept = TRUE, model = TRUE, penalty = list( + lambda = NULL, + c = 1.1, gamma = 0.1 / log(n) + ), control = list(threshold = NULL), ...) { formula <- as.formula(x) - res <- rlassologit.formula(formula, data = data, post = post, intercept = intercept, model = model, penalty = penalty, control = control, ...) + res <- rlassologit.formula(formula, data = data, post = post, intercept = intercept, model = model, penalty = penalty, control = control, ...) return(res) } #' @rdname rlassologit #' @export -rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, penalty = list(lambda = NULL, - c = 1.1, gamma = 0.1/log(n)), control = list(threshold = NULL), ...) { +rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TRUE, penalty = list( + lambda = NULL, + c = 1.1, gamma = 0.1 / log(n) + ), control = list(threshold = NULL), ...) { n <- dim(x)[1] p <- dim(x)[2] - if (is.null(colnames(x))) + if (is.null(colnames(x))) { colnames(x) <- paste("V", 1:p, sep = "") + } ind.names <- 1:p - + if (!exists("c", where = penalty)) { - if (post==TRUE) { - penalty$c = 1.1 + if (post == TRUE) { + penalty$c <- 1.1 } else { - penalty$c = 0.5 + penalty$c <- 0.5 } } - + if (!exists("gamma", where = penalty)) { - penalty$gamma = 0.1/log(n) + penalty$gamma <- 0.1 / log(n) } - + if (is.null(penalty$gamma)) { - penalty$gamma = 0.1/log(n) + penalty$gamma <- 0.1 / log(n) } - + if (!is.null(penalty$lambda)) { - lambda <- penalty$lambda/(2 * n) + lambda <- penalty$lambda / (2 * n) lambda0 <- lambda * (2 * n) } else { - lambda0 <- penalty$c/2 * sqrt(n) * qnorm(1 - penalty$gamma/(2 * p)) - #lambda0 <- penalty$c/2 * sqrt(n) * qnorm(1 - penalty$gamma/(max(n, p*log(n)))) - lambda <- lambda0/(2 * n) + lambda0 <- penalty$c / 2 * sqrt(n) * qnorm(1 - penalty$gamma / (2 * p)) + # lambda0 <- penalty$c/2 * sqrt(n) * qnorm(1 - penalty$gamma/(max(n, p*log(n)))) + lambda <- lambda0 / (2 * n) } - + s0 <- sqrt(var(y)) # calculation parameters - #xs <- scale(x, center = FALSE, scale = TRUE) # to prevent "double" scaling, removed also from next line - log.lasso <- glmnet::glmnet(x, y, family = c("binomial"), alpha = 1, - lambda = lambda[1], standardize = TRUE, intercept = intercept) + # xs <- scale(x, center = FALSE, scale = TRUE) # to prevent "double" scaling, removed also from next line + log.lasso <- glmnet::glmnet(x, y, + family = c("binomial"), alpha = 1, + lambda = lambda[1], standardize = TRUE, intercept = intercept + ) coefTemp <- as.vector(log.lasso$beta) coefTemp[is.na(coefTemp)] <- 0 ind1 <- (abs(coefTemp) > 0) x1 <- as.matrix(x[, ind1, drop = FALSE]) if (dim(x1)[2] == 0) { if (intercept == TRUE) { - a0 <- log(mean(y)/(1 - mean(y))) + a0 <- log(mean(y) / (1 - mean(y))) res <- y - mean(y) - coefs <- c(a0, rep(0,p)) + coefs <- c(a0, rep(0, p)) names(coefTemp) <- names(ind1) <- colnames(x) names(coefs) <- c("(Intercept)", names(coefTemp)) } - + if (intercept == FALSE) { - a0 <- 0 # or NA? + a0 <- 0 # or NA? 
res <- y - 0.5 message("Residuals not defined, set to 0.5") - coefs <- rep(0,p) + coefs <- rep(0, p) names(coefTemp) <- names(ind1) <- colnames(x) names(coefs) <- names(coefTemp) } - est <- list(coefficients = coefs, beta = coefTemp, intercept = a0, index = rep(FALSE, - p), s0 = s0, lambda0 = lambda0, residuals = res, sigma = sqrt(var(res)), - call = match.call(), options = list(post = post, intercept = intercept, - control = control)) + est <- list( + coefficients = coefs, beta = coefTemp, intercept = a0, index = rep( + FALSE, + p + ), s0 = s0, lambda0 = lambda0, residuals = res, sigma = sqrt(var(res)), + call = match.call(), options = list( + post = post, intercept = intercept, + control = control + ) + ) if (model) est$model <- x class(est) <- c("rlassologit") return(est) } - + # refinement variance estimation if (post) { if (intercept) { @@ -183,7 +200,7 @@ rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TR e1 <- y - reg$fitted.values coefTemp[ind1] <- coefT } - + if (!intercept) { reg <- glm(y ~ -1 + x1, family = binomial(link = "logit")) coefT <- coef(reg) @@ -195,38 +212,44 @@ rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TR if (!post) { e1 <- y - predict(log.lasso, newx = x, type = "response") } - + coefTemp <- as.vector(coefTemp) coefTemp[abs(coefTemp) < control$threshold] <- 0 ind1 <- as.vector(ind1) names(coefTemp) <- names(ind1) <- colnames(x) - - + + if (intercept == TRUE) { - if (post == TRUE) + if (post == TRUE) { a0 <- coef(reg)[1] - if (post == FALSE) + } + if (post == FALSE) { a0 <- coef(log.lasso)[1] - + } + coefs <- c(a0, coefTemp) names(coefs)[1] <- "(Intercept)" } - + if (intercept == FALSE) { - a0 <- 0 # or NA? + a0 <- 0 # or NA? coefs <- coefTemp } - - est <- list(coefficients = coefs, beta = coefTemp, intercept=a0, index = ind1, lambda0 = lambda0, - residuals = e1, sigma = sqrt(var(e1)), call = match.call(), options = list(post = post, - intercept = intercept, control = control)) + + est <- list( + coefficients = coefs, beta = coefTemp, intercept = a0, index = ind1, lambda0 = lambda0, + residuals = e1, sigma = sqrt(var(e1)), call = match.call(), options = list( + post = post, + intercept = intercept, control = control + ) + ) if (model) est$model <- x class(est) <- c("rlassologit") return(est) } -############################################################################################################################### +############################################################################################################################### ################# Methods for logistic Lasso @@ -248,7 +271,7 @@ rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TR #' @aliases methods.rlassologit print.rlassologit summary.rlassologit predict.rlassologit model.matrix.rlassologit #' @export -# predict.rlassologit <- function(object, newdata = NULL, type = "response", +# predict.rlassologit <- function(object, newdata = NULL, type = "response", # ...) 
{ # if (missing(newdata) || is.null(newdata)) { # if (is.matrix(model.matrix(object))) { @@ -275,7 +298,7 @@ rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TR # } else { # X <- as.matrix(newdata[, varcoefbasis]) # } -# +# # } else { # #X <- as.matrix(newdata) # formula <- eval(object$call[[2]]) @@ -284,34 +307,34 @@ rlassologit.default <- function(x, y, post = TRUE, intercept = TRUE, model = TR # X <- X[,-object$options$ind.scale] # } # stopifnot(ncol(X)==length(object$coefficients)) -# +# # } # } # n <- dim(X)[1] #length(object$residuals) # beta <- object$coefficients # if (object$options[["intercept"]]) { # yp <- object$a0 + as.matrix(X) %*% as.vector(beta) -# if (dim(X)[2] == 0) +# if (dim(X)[2] == 0) # yp <- rep(object$a0, n) -# if (type == "response") +# if (type == "response") # yhat <- 1/(1 + exp(-yp)) -# if (type == "link") +# if (type == "link") # yhat <- yp # } # if (!object$options[["intercept"]]) { # yp <- X %*% as.vector(beta) -# if (dim(X)[2] == 0) +# if (dim(X)[2] == 0) # yp <- rep(0, n) # yhat <- 1/(1 + exp(-yp)) -# if (type == "response") +# if (type == "response") # yhat <- 1/(1 + exp(-yp)) -# if (type == "link") +# if (type == "link") # yhat <- yp # } # return(yhat) # } -predict.rlassologit <- function (object, newdata = NULL, type = "response", ...){ +predict.rlassologit <- function(object, newdata = NULL, type = "response", ...) { # if (missing(newdata) || is.null(newdata)) { # X <- model.matrix(object) # if (sum(object$options$ind.scale) != 0) { @@ -336,8 +359,8 @@ predict.rlassologit <- function (object, newdata = NULL, type = "response", ...) # } mf <- match.call(expand.dots = TRUE) m <- match("newx", names(mf), 0L) - if (m!=0L) stop("Please use argument \"newdata\" instead of \"newx\" to provide data for prediction.") - + if (m != 0L) stop("Please use argument \"newdata\" instead of \"newx\" to provide data for prediction.") + k <- length(object$beta) if (missing(newdata) || is.null(newdata)) { X <- model.matrix(object) @@ -350,19 +373,19 @@ predict.rlassologit <- function (object, newdata = NULL, type = "response", ...) if (dim(X)[2] != k) { stop("No variable names provided in newdata and number of parameters does not fit!") } else { - #message("No variable names provided in newdata. Prediction relies on right ordering of the variables.") + # message("No variable names provided in newdata. Prediction relies on right ordering of the variables.") } } else { varcoef <- names(object$beta) - if (all(is.element(varcoef, colnames(newdata)))){ + if (all(is.element(varcoef, colnames(newdata)))) { X <- as.matrix(newdata[, varcoef]) } else { mod.frame <- as.data.frame(cbind(rep(1, nrow(newdata)), newdata)) colnames(mod.frame)[1] <- as.character(eval(object$call$formula)[[2]]) X <- try(model.matrix(eval(object$call$formula), data = mod.frame)[, -1, drop = FALSE]) - if(inherits(X, "try-error")){ + if (inherits(X, "try-error")) { stop("newdata does not contain the variables specified in formula") - } + } } } } @@ -373,22 +396,28 @@ predict.rlassologit <- function (object, newdata = NULL, type = "response", ...) 
beta <- object$beta if (object$options[["intercept"]]) { yp <- object$intercept + X %*% as.vector(beta) - if (dim(X)[2] == 0) + if (dim(X)[2] == 0) { yp <- rep(object$intercept, n) - if (type == "response") - yhat <- 1/(1 + exp(-yp)) - if (type == "link") + } + if (type == "response") { + yhat <- 1 / (1 + exp(-yp)) + } + if (type == "link") { yhat <- yp + } } if (!object$options[["intercept"]]) { yp <- X %*% as.vector(beta) - if (dim(X)[2] == 0) + if (dim(X)[2] == 0) { yp <- rep(0, n) - yhat <- 1/(1 + exp(-yp)) - if (type == "response") - yhat <- 1/(1 + exp(-yp)) - if (type == "link") + } + yhat <- 1 / (1 + exp(-yp)) + if (type == "response") { + yhat <- 1 / (1 + exp(-yp)) + } + if (type == "link") { yhat <- yp + } } return(yhat) } @@ -397,7 +426,7 @@ predict.rlassologit <- function (object, newdata = NULL, type = "response", ...) #' @export # model.matrix.rlassologit <- function(object, ...) { -# +# # # falls formula # if (is.call(object$call[[2]])) { # problem when formula handed as expression # # falls kein Datensatz uebergeben @@ -418,12 +447,12 @@ predict.rlassologit <- function (object, newdata = NULL, type = "response", ...) # return(mm) # } -model.matrix.rlassologit <- function(object, ...){ - if(is.null(object$model)){ - if (!is.null(object$call$x)){ +model.matrix.rlassologit <- function(object, ...) { + if (is.null(object$model)) { + if (!is.null(object$call$x)) { mm <- as.matrix(eval(object$call$x)) } else { - if(!is.null(object$call$data)){ + if (!is.null(object$call$data)) { mm <- model.matrix(eval(object$call$formula), data = eval(object$call$data))[, -1, drop = FALSE] } else { mm <- model.matrix(eval(object$call$formula))[, -1, drop = FALSE] @@ -439,33 +468,43 @@ model.matrix.rlassologit <- function(object, ...){ #' @rdname methods.rlassologit #' @export -print.rlassologit <- function(x, all = TRUE, digits = max(3L, getOption("digits") - - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") +print.rlassologit <- function(x, all = TRUE, digits = max(3L, getOption("digits") - + 3L), ...) 
{ + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(coef(x))) { coeffs <- coef(x) - #if (x$options$intercept) { + # if (x$options$intercept) { # coeffs <- c(x$intercept, coeffs) # names(coeffs)[1] <- "(Intercept)" # index <- cbind(1, x$index) - #} + # } if (all) { cat("Coefficients:\n") - print.default(format(coeffs, digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(coeffs, digits = digits), + print.gap = 2L, + quote = FALSE + ) } else { - #print.default(format(coeffs[index], digits = digits), print.gap = 2L, + # print.default(format(coeffs[index], digits = digits), print.gap = 2L, # quote = FALSE) if (x$options$intercept) { - print.default(format(coef(x)[c(TRUE,x$index)], digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(coef(x)[c(TRUE, x$index)], digits = digits), + print.gap = 2L, + quote = FALSE + ) } else { - print.default(format(beta$x[x$index], digits = digits), print.gap = 2L, - quote = FALSE) + print.default(format(beta$x[x$index], digits = digits), + print.gap = 2L, + quote = FALSE + ) } } - } else cat("No coefficients\n") + } else { + cat("No coefficients\n") + } cat("\n") invisible(x) } @@ -473,12 +512,15 @@ print.rlassologit <- function(x, all = TRUE, digits = max(3L, getOption("digits" #' @rdname methods.rlassologit #' @export -summary.rlassologit <- function(object, all = TRUE, digits = max(3L, getOption("digits") - - 3L), ...) { - cat("\nCall:\n", paste(deparse(object$call), sep = "\n", collapse = "\n"), - "\n", sep = "") - cat("\nPost-Lasso Estimation: ", paste(deparse(object$options$post), - sep = "\n", collapse = "\n"), "\n", sep = " ") +summary.rlassologit <- function(object, all = TRUE, digits = max(3L, getOption("digits") - + 3L), ...) { + cat("\nCall:\n", paste(deparse(object$call), sep = "\n", collapse = "\n"), + "\n", + sep = "" + ) + cat("\nPost-Lasso Estimation: ", paste(deparse(object$options$post), + sep = "\n", collapse = "\n" + ), "\n", sep = " ") coefs <- object$coefficients p <- length(object$beta) num.selected <- sum(abs(object$beta) > 0) @@ -497,7 +539,7 @@ summary.rlassologit <- function(object, all = TRUE, digits = max(3L, getOption(" coefm[, 1] <- coefs colnames(coefm) <- "Estimate" rownames(coefm) <- names(coefs) - printCoefmat(coefm, digits = digits, na.print = "NA") #, P.values=TRUE, has.Pvalue=TRUE) + printCoefmat(coefm, digits = digits, na.print = "NA") # , P.values=TRUE, has.Pvalue=TRUE) } cat("\n") invisible(object) diff --git a/R/rlassologitEffect.R b/R/rlassologitEffect.R index 3d2780d..453b914 100644 --- a/R/rlassologitEffect.R +++ b/R/rlassologitEffect.R @@ -22,29 +22,30 @@ #' cemmap working paper CWP67/13. #' @export #' @rdname rlassologitEffects -rlassologitEffects <- function(x, ...) - UseMethod("rlassologitEffects") # definition generic function +rlassologitEffects <- function(x, ...) 
{ + UseMethod("rlassologitEffects") +} # definition generic function #' @export #' @rdname rlassologitEffects #' @examples -#'\dontrun{ +#' \dontrun{ #' library(hdm) #' ## DGP #' set.seed(2) #' n <- 250 #' p <- 100 #' px <- 10 -#' X <- matrix(rnorm(n*p), ncol=p) -#' colnames(X) = paste("V", 1:p, sep="") -#' beta <- c(rep(2,px), rep(0,p-px)) +#' X <- matrix(rnorm(n * p), ncol = p) +#' colnames(X) <- paste("V", 1:p, sep = "") +#' beta <- c(rep(2, px), rep(0, p - px)) #' intercept <- 1 -#' P <- exp(intercept + X %*% beta)/(1+exp(intercept + X %*% beta)) -#' y <- rbinom(n, size=1, prob=P) -#' xd <- X[,2:50] -#' d <- X[,1] -#' logit.effect <- rlassologitEffect(x=xd, d=d, y=y) -#' logit.effects <- rlassologitEffects(X,y, index=c(1,2,40)) +#' P <- exp(intercept + X %*% beta) / (1 + exp(intercept + X %*% beta)) +#' y <- rbinom(n, size = 1, prob = P) +#' xd <- X[, 2:50] +#' d <- X[, 1] +#' logit.effect <- rlassologitEffect(x = xd, d = d, y = y) +#' logit.effects <- rlassologitEffects(X, y, index = c(1, 2, 40)) #' logit.effects.f <- rlassologitEffects(y ~ X, I = ~ V1 + V2) #' } rlassologitEffects.default <- function(x, y, index = c(1:ncol(x)), I3 = NULL, post = TRUE, ...) { @@ -73,13 +74,15 @@ rlassologitEffects.default <- function(x, y, index = c(1:ncol(x)), I3 = NULL, po } } } - + # check validity of I3 I3ind <- which(I3 == T) - if (length(intersect(index, I3ind) != 0)) + if (length(intersect(index, I3ind) != 0)) { stop("I3 and index must not overlap!") - if (is.null(colnames(x))) + } + if (is.null(colnames(x))) { colnames(x) <- paste("V", 1:dim(x)[2], sep = "") + } coefficients <- as.vector(rep(NA_real_, k)) se <- rep(NA_real_, k) t <- rep(NA_real_, k) @@ -88,7 +91,7 @@ rlassologitEffects.default <- function(x, y, index = c(1:ncol(x)), I3 = NULL, po reside <- matrix(NA, nrow = n, ncol = p1) residv <- matrix(NA, nrow = n, ncol = p1) names(coefficients) <- names(se) <- names(t) <- names(pval) <- names(lasso.regs) <- colnames(reside) <- colnames(residv) <- colnames(x)[index] - + for (i in 1:k) { d <- x[, index[i], drop = FALSE] Xt <- x[, -index[i], drop = FALSE] @@ -101,14 +104,16 @@ rlassologitEffects.default <- function(x, y, index = c(1:ncol(x)), I3 = NULL, po se[i] <- col$se t[i] <- col$t pval[i] <- col$pval - reside[,i] <- col$residuals$epsilon - residv[,i] <- col$residuals$v + reside[, i] <- col$residuals$epsilon + residv[, i] <- col$residuals$v } } residuals <- list(e = reside, v = residv) - res <- list(coefficients = coefficients, se = se, t = t, pval = pval, - lasso.regs = lasso.regs, index = I, call = match.call(), samplesize = n, - residuals = residuals) + res <- list( + coefficients = coefficients, se = se, t = t, pval = pval, + lasso.regs = lasso.regs, index = I, call = match.call(), samplesize = n, + residuals = residuals + ) class(res) <- c("rlassologitEffects") return(res) } @@ -117,14 +122,14 @@ rlassologitEffects.default <- function(x, y, index = c(1:ncol(x)), I3 = NULL, po #' @rdname rlassologitEffects #' @export #' @param formula An element of class \code{formula} specifying the linear model. -#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which the function is called. 
#' @param I An one-sided formula specifying the variables for which inference is conducted. #' @param included One-sided formula of variables which should be included in any case. -rlassologitEffects.formula <- function(formula, data, I, - included = NULL, post = TRUE, ...) { +rlassologitEffects.formula <- function(formula, data, I, + included = NULL, post = TRUE, ...) { cl <- match.call() - if (missing(data)) data <- environment(formula) + if (missing(data)) data <- environment(formula) mf <- match.call(expand.dots = FALSE) m <- match(c("formula", "data"), names(mf), 0L) mf <- mf[c(1L, m)] @@ -135,15 +140,16 @@ rlassologitEffects.formula <- function(formula, data, I, attr(mt, "intercept") <- 1 y <- model.response(mf, "numeric") n <- length(y) - x <- model.matrix(mt, mf)[,-1, drop=FALSE] + x <- model.matrix(mt, mf)[, -1, drop = FALSE] cn <- attr(mt, "term.labels") - try(if (is.matrix(eval(parse(text=cn)))) cn <- colnames(eval(parse(text=cn))), silent = TRUE) + try(if (is.matrix(eval(parse(text = cn)))) cn <- colnames(eval(parse(text = cn))), silent = TRUE) I.c <- check_variables(I, cn) I3 <- check_variables(included, cn) - - if (length(intersect(I.c, I3) != 0)) + + if (length(intersect(I.c, I3) != 0)) { stop("I and included should not contain the same variables!") - + } + est <- rlassologitEffects(x, y, index = I.c, I3 = I3, post = post, ...) est$call <- cl return(est) @@ -157,28 +163,32 @@ rlassologitEffect <- function(x, y, d, I3 = NULL, post = TRUE) { y <- as.matrix(y, ncol = 1) kx <- p <- dim(x)[2] n <- dim(x)[1] - if (is.null(colnames(d))) + if (is.null(colnames(d))) { colnames(d) <- "d1" - if (is.null(colnames(x)) & !is.null(x)) + } + if (is.null(colnames(x)) & !is.null(x)) { colnames(x) <- paste("x", 1:kx, sep = "") + } # Step 1 - la1 <- 1.1/2 * sqrt(n) * qnorm(1 - 0.05/(max(n, (p + 1) * log(n)))) + la1 <- 1.1 / 2 * sqrt(n) * qnorm(1 - 0.05 / (max(n, (p + 1) * log(n)))) dx <- cbind(d, x) l1 <- rlassologit(y ~ dx, post = post, intercept = TRUE, penalty = list(lambda.start = la1)) xi <- l1$residuals t <- predict(l1, type = "link", newdata = dx) - sigma2 <- exp(t)/(1 + exp(t))^2 - w <- sigma2 #exp(t)/(1+exp(t))^2 - f <- sqrt(sigma2) #w/sigma2=1 + sigma2 <- exp(t) / (1 + exp(t))^2 + w <- sigma2 # exp(t)/(1+exp(t))^2 + f <- sqrt(sigma2) # w/sigma2=1 I1 <- l1$index[-1] # Step 2 - la2 <- rep(2.2 * sqrt(n) * qnorm(1 - 0.05/(max(n, p * log(n)))), p) + la2 <- rep(2.2 * sqrt(n) * qnorm(1 - 0.05 / (max(n, p * log(n)))), p) xf <- x * as.vector(f) df <- d * f - l2 <- rlasso(xf, df, post = post, intercept = TRUE, penalty = list(homoscedastic = "none", - lambda.start = la2, c = 1.1, gamma = 0.1)) + l2 <- rlasso(xf, df, post = post, intercept = TRUE, penalty = list( + homoscedastic = "none", + lambda.start = la2, c = 1.1, gamma = 0.1 + )) I2 <- l2$index - z <- l2$residual/sqrt(sigma2) + z <- l2$residual / sqrt(sigma2) # Step 3 if (is.logical(I3)) { I <- I1 + I2 + I3 @@ -189,16 +199,16 @@ rlassologitEffect <- function(x, y, d, I3 = NULL, post = TRUE) { } xselect <- x[, I] p3 <- dim(xselect)[2] - #la3 <- 1.1/2 * sqrt(n) * qnorm(1 - 0.05/(max(n, (p3 + 1) * log(n)))) - #l3 <- rlassologit(cbind(d, xselect), y, post = TRUE, normalize = TRUE, + # la3 <- 1.1/2 * sqrt(n) * qnorm(1 - 0.05/(max(n, (p3 + 1) * log(n)))) + # l3 <- rlassologit(cbind(d, xselect), y, post = TRUE, normalize = TRUE, # intercept = TRUE, penalty = list(lambda.start = la3)) - l3 <- glm(y ~ cbind(d, xselect),family=binomial(link='logit')) + l3 <- glm(y ~ cbind(d, xselect), family = binomial(link = "logit")) alpha <- l3$coef[2] 
names(alpha) <- colnames(d) t3 <- predict(l3, type = "link") - G3 <- exp(t3)/(1 + exp(t3)) + G3 <- exp(t3) / (1 + exp(t3)) w3 <- G3 * (1 - G3) - S21 <- 1/mean(w3 * d * z)^2 * mean((y - G3)^2 * z^2) + S21 <- 1 / mean(w3 * d * z)^2 * mean((y - G3)^2 * z^2) xtilde <- x[, l3$index[-1]] p2 <- sum(l3$index) + 1 b <- cbind(d, xtilde) @@ -207,10 +217,10 @@ rlassologitEffect <- function(x, y, d, I3 = NULL, post = TRUE) { for (i in 1:n) { A <- A + w3[i] * outer(b[i, ], b[i, ]) } - S22 <- solve(1/n * A)[1, 1] + S22 <- solve(1 / n * A)[1, 1] S2 <- max(S21, S22) - se <- sqrt(S2/n) - tval <- alpha/se + se <- sqrt(S2 / n) + tval <- alpha / se pval <- 2 * pnorm(-abs(tval)) if (is.null(I)) { no.selected <- 1 @@ -219,12 +229,14 @@ rlassologitEffect <- function(x, y, d, I3 = NULL, post = TRUE) { } # return(list(alpha=unname(alpha), se=drop(se), t=unname(tval), # pval=unname(pval), coefficients=coef(l3), residuals=l3$residuals)) - res <- list(epsilon= l3$residuals, v= z) + res <- list(epsilon = l3$residuals, v = z) se <- drop(se) names(se) <- colnames(d) - results <- list(alpha = alpha, se = se, t = tval, pval = pval, - no.selected = no.selected, coefficients = alpha, coefficient = alpha, - residuals = res, call = match.call(), samplesize = n, post = post) + results <- list( + alpha = alpha, se = se, t = tval, pval = pval, + no.selected = no.selected, coefficients = alpha, coefficient = alpha, + residuals = res, call = match.call(), samplesize = n, post = post + ) class(results) <- c("rlassologitEffects") return(results) } @@ -235,7 +247,7 @@ rlassologitEffect <- function(x, y, d, I3 = NULL, post = TRUE) { #' Methods for S3 object \code{rlassologitEffects} #' -#' Objects of class \code{rlassologitEffects} are construced by \code{rlassologitEffects} or \code{rlassologitEffect}. +#' Objects of class \code{rlassologitEffects} are construced by \code{rlassologitEffects} or \code{rlassologitEffect}. #' \code{print.rlassologitEffects} prints and displays some information about fitted \code{rlassologitEffect} objects. #' \code{summary.rlassologitEffects} summarizes information of a fitted \code{rlassologitEffects} object. #' \code{confint.rlassologitEffects} extracts the confidence intervals. @@ -247,15 +259,21 @@ rlassologitEffect <- function(x, y, d, I3 = NULL, post = TRUE) { #' @aliases methods.rlassologitEffects print.rlassologitEffects summary.rlassologitEffects confint.rlassologitEffects #' @export -print.rlassologitEffects <- function(x, digits = max(3L, getOption("digits") - - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") +print.rlassologitEffects <- function(x, digits = max(3L, getOption("digits") - + 3L), ...) { + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(coef(x))) { cat("Coefficients:\n") - print.default(format(coef(x), digits = digits), print.gap = 2L, - quote = FALSE) - } else cat("No coefficients\n") + print.default(format(coef(x), digits = digits), + print.gap = 2L, + quote = FALSE + ) + } else { + cat("No coefficients\n") + } cat("\n") invisible(coef(x)) } @@ -264,8 +282,8 @@ print.rlassologitEffects <- function(x, digits = max(3L, getOption("digits") - #' @rdname methods.rlassologitEffects #' @export -summary.rlassologitEffects <- function(object, digits = max(3L, getOption("digits") - - 3L), ...) { +summary.rlassologitEffects <- function(object, digits = max(3L, getOption("digits") - + 3L), ...) 
{ if (length(coef(object))) { k <- length(object$coefficients) table <- matrix(NA, ncol = 4, nrow = k) @@ -291,27 +309,31 @@ summary.rlassologitEffects <- function(object, digits = max(3L, getOption("digit #' @param joint logical, if joint confidence intervals should be clalculated #' @export -confint.rlassologitEffects <- function(object, parm, level = 0.95, joint = FALSE, +confint.rlassologitEffects <- function(object, parm, level = 0.95, joint = FALSE, ...) { - B <- 500 # number of bootstrap repitions + B <- 500 # number of bootstrap repitions n <- object$samplesize k <- p1 <- length(object$coefficient) cf <- coef(object) pnames <- names(cf) - if (missing(parm)) - parm <- pnames else if (is.numeric(parm)) - parm <- pnames[parm] + if (missing(parm)) { + parm <- pnames + } else if (is.numeric(parm)) { + parm <- pnames[parm] + } if (!joint) { - a <- (1 - level)/2 + a <- (1 - level) / 2 a <- c(a, 1 - a) fac <- qt(a, n - k) pct <- format.perc(a, 3) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list( + parm, + pct + )) ses <- object$se[parm] ci[] <- cf[parm] + ses %o% fac } - + if (joint) { # phi <- object$residuals$e * object$residuals$v # m <- 1/sqrt(colMeans(phi^2)) @@ -326,41 +348,36 @@ confint.rlassologitEffects <- function(object, parm, level = 0.95, joint = FALSE # } e <- object$residuals$e v <- object$residuals$v - ev <- e*v + ev <- e * v Ev2 <- colMeans(v^2) Ee2v2 <- colMeans(ev^2) - Omegahat <- matrix(NA, ncol=k, nrow=k) + Omegahat <- matrix(NA, ncol = k, nrow = k) for (j in 1:k) { for (l in 1:k) { - Omegahat[j,l] = Omegahat[l,j] = 1/(Ev2[j]*Ev2[l]) * mean(ev[,j]*ev[,l]) + Omegahat[j, l] <- Omegahat[l, j] <- 1 / (Ev2[j] * Ev2[l]) * mean(ev[, j] * ev[, l]) } } var <- diag(Omegahat) names(var) <- names(cf) - Beta <- matrix(NA, ncol=B, nrow=k) + Beta <- matrix(NA, ncol = B, nrow = k) sim <- vector("numeric", length = B) for (i in 1:B) { - beta_i <- MASS::mvrnorm(mu = rep(0,k), Sigma=Omegahat/n) - sim[i] <- max(abs(sqrt(n)*beta_i/var)) + beta_i <- MASS::mvrnorm(mu = rep(0, k), Sigma = Omegahat / n) + sim[i] <- max(abs(sqrt(n) * beta_i / var)) } - a <- (1 - level) #not dividing by 2! - ab <- c(a/2, 1 - a/2) - pct <- format.perc(ab, 3) - ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, - pct)) + a <- (1 - level) # not dividing by 2! + ab <- c(a / 2, 1 - a / 2) + pct <- format.perc(ab, 3) + ci <- array(NA, dim = c(length(parm), 2L), dimnames = list( + parm, + pct + )) # hatc <- quantile(sim, probs = 1 - a) # ci[, 1] <- cf[parm] - hatc * 1/sqrt(n) * sigma # ci[, 2] <- cf[parm] + hatc * 1/sqrt(n) * sigma hatc <- quantile(sim, probs = 1 - a) - ci[, 1] <- cf[parm] - hatc * 1/sqrt(n) * sqrt(var[parm]) - ci[, 2] <- cf[parm] + hatc * 1/sqrt(n) * sqrt(var[parm]) + ci[, 1] <- cf[parm] - hatc * 1 / sqrt(n) * sqrt(var[parm]) + ci[, 2] <- cf[parm] + hatc * 1 / sqrt(n) * sqrt(var[parm]) } return(ci) } - - - - - - - diff --git a/R/rlassotreatment.R b/R/rlassotreatment.R index 4558934..647b22b 100644 --- a/R/rlassotreatment.R +++ b/R/rlassotreatment.R @@ -30,8 +30,9 @@ #' (2014). Program evaluation with high-dimensional data. Working Paper. #' @rdname TE #' @export -rlassoATE <- function(x, ...) - UseMethod("rlassoATE") # definition generic function +rlassoATE <- function(x, ...) { + UseMethod("rlassoATE") +} # definition generic function #' @rdname TE #' @export @@ -46,7 +47,7 @@ rlassoATE.default <- function(x, d, y, bootstrap = "none", nRep = 500, ...) 
{ #' @export #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x" with y the outcome variable, #' d treatment variable, and x exogenous variables. -#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoATE} is called. rlassoATE.formula <- function(formula, data, bootstrap = "none", nRep = 500, ...) { mat <- f.formula(formula, data, all.categories = FALSE) @@ -54,22 +55,25 @@ rlassoATE.formula <- function(formula, data, bootstrap = "none", nRep = 500, ... x <- mat$X d <- mat$D check_binary(d) - res <- rlassoATE(x=x, d=d, y=y, bootstrap = bootstrap, nRep = nRep, ...) + res <- rlassoATE(x = x, d = d, y = y, bootstrap = bootstrap, nRep = nRep, ...) res$call <- match.call() return(res) } #' @export #' @rdname TE -rlassoATET <- function(x, ...) - UseMethod("rlassoATET") # definition generic function +rlassoATET <- function(x, ...) { + UseMethod("rlassoATET") +} # definition generic function #' @export #' @rdname TE rlassoATET.default <- function(x, d, y, bootstrap = "none", nRep = 500, ...) { z <- d - res <- rlassoLATET(x, d, y, z, bootstrap = bootstrap, nRep = nRep, - ...) + res <- rlassoLATET(x, d, y, z, + bootstrap = bootstrap, nRep = nRep, + ... + ) res$type <- "ATET" return(res) } @@ -78,7 +82,7 @@ rlassoATET.default <- function(x, d, y, bootstrap = "none", nRep = 500, ...) { #' @export # #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x" with y the outcome variable, # #' d treatment variable, and x exogenous variables. -# #' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +# #' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. # #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoATET} is called. rlassoATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, ...) { mat <- f.formula(formula, data, all.categories = FALSE) @@ -86,7 +90,7 @@ rlassoATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, .. x <- mat$X d <- mat$D check_binary(d) - res <- rlassoATET(x=x, d=d, y=y, bootstrap = bootstrap, nRep = nRep, ...) + res <- rlassoATET(x = x, d = d, y = y, bootstrap = bootstrap, nRep = nRep, ...) res$call <- match.call() return(res) } @@ -94,117 +98,129 @@ rlassoATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, .. #' @export #' @rdname TE -rlassoLATE <- function(x, ...) - UseMethod("rlassoLATE") # definition generic function +rlassoLATE <- function(x, ...) { + UseMethod("rlassoLATE") +} # definition generic function #' @export #' @param post logical. If \code{TRUE}, post-lasso estimation is conducted. #' @param intercept logical. If \code{TRUE}, intercept is included which is not #' @rdname TE -rlassoLATE.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post = TRUE, - intercept = TRUE, always_takers = TRUE, never_takers = TRUE, ...) 
{ +rlassoLATE.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post = TRUE, + intercept = TRUE, always_takers = TRUE, never_takers = TRUE, ...) { x <- as.matrix(x) n <- dim(x)[1] p <- dim(x)[2] checkmate::checkChoice(bootstrap, c("none", "Bayes", "normal", "wild")) checkmate::checkLogical(always_takers, never_takers) - lambda <- 2.2 * sqrt(n) * qnorm(1 - (.1/log(n))/(2 * (2 * p))) + lambda <- 2.2 * sqrt(n) * qnorm(1 - (.1 / log(n)) / (2 * (2 * p))) control <- list(numIter = 15, tol = 10^-5) # penalty <- list(method = 'none', lambda.start = rep(lambda, p), c = # 1.1, gamma = 0.1) - penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, - p), c = 1.1, gamma = 0.1) + penalty <- list(homoscedastic = "none", lambda.start = rep( + lambda, + p + ), c = 1.1, gamma = 0.1) indz1 <- (z == 1) indz0 <- (z == 0) # E[Y|Z = 1,X] = my_z1x - b_y_z1xL <- rlasso(y[indz1] ~ x[indz1, , drop = FALSE], post = post, - intercept = intercept, control = control, penalty = penalty) + b_y_z1xL <- rlasso(y[indz1] ~ x[indz1, , drop = FALSE], + post = post, + intercept = intercept, control = control, penalty = penalty + ) my_z1x <- predict(b_y_z1xL, newdata = x) # E[Y|Z = 0,X] = my_z0x - b_y_z0xL <- rlasso(y[indz0] ~ x[indz0, , drop = FALSE], post = post, - intercept = intercept, control = control, penalty = penalty) + b_y_z0xL <- rlasso(y[indz0] ~ x[indz0, , drop = FALSE], + post = post, + intercept = intercept, control = control, penalty = penalty + ) my_z0x <- predict(b_y_z0xL, newdata = x) # E[D|Z = 1,X] = md_z1x - lambda <- 2.2 * sqrt(n) * qnorm(1 - (.1/log(n))/(2 * (2 * p))) + lambda <- 2.2 * sqrt(n) * qnorm(1 - (.1 / log(n)) / (2 * (2 * p))) penalty <- list(lambda.start = lambda, c = 1.1, gamma = 0.1) - + # if (sum(d - z) != 0) { - # b_d_z1xL <- rlassologit(d[indz1] ~ x[indz1, , drop = FALSE], post = post, + # b_d_z1xL <- rlassologit(d[indz1] ~ x[indz1, , drop = FALSE], post = post, # intercept = intercept, penalty = penalty) # md_z1x <- predict(b_d_z1xL, newdata = x) # } else { # md_z1x <- rep(1, n) # } - # + # # E[D|Z = 0,X] = md_z0x - #penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, + # penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, # p), c = 1.1, gamma = 0.1) - #b_d_z0x <- rlassologit(d[indz0] ~ x[indz0, ], post = post, intercept = intercept, penalty = penalty) - #md_z0x <- predict(b_d_z0x, newdata = x) + # b_d_z0x <- rlassologit(d[indz0] ~ x[indz0, ], post = post, intercept = intercept, penalty = penalty) + # md_z0x <- predict(b_d_z0x, newdata = x) # md_z0x <- rep(0, n) - + # E[D|Z = 1,X] = md_z1x - if (identical(d,z)) { + if (identical(d, z)) { md_z1x <- rep(1, n) md_z0x <- rep(0, n) } - - else { - + + else { if (all(always_takers, never_takers)) { - g_d_z1 <- rlassologit(d[indz1] ~ x[indz1, , drop = FALSE], - post = post, intercept = intercept, penalty = penalty) - md_z1x <- predict(g_d_z1, newdata = x) - - g_d_z0 <- rlassologit(d[indz0] ~ x[indz0, , drop = FALSE], - post = post, intercept = intercept, penalty = penalty) - md_z0x <- predict(g_d_z0, newdata = x) - } - - + g_d_z1 <- rlassologit(d[indz1] ~ x[indz1, , drop = FALSE], + post = post, intercept = intercept, penalty = penalty + ) + md_z1x <- predict(g_d_z1, newdata = x) + + g_d_z0 <- rlassologit(d[indz0] ~ x[indz0, , drop = FALSE], + post = post, intercept = intercept, penalty = penalty + ) + md_z0x <- predict(g_d_z0, newdata = x) + } + + if (always_takers == FALSE & never_takers == TRUE) { - g_d_z1 <- rlassologit(d[indz1] ~ x[indz1, , drop = FALSE], - post = post, intercept = 
intercept, penalty = penalty) + g_d_z1 <- rlassologit(d[indz1] ~ x[indz1, , drop = FALSE], + post = post, intercept = intercept, penalty = penalty + ) md_z1x <- predict(g_d_z1, newdata = x) - + ### no always-takers: E[D=1 | Z=0,X] = 0 md_z0x <- rep(0, n) } - + if (always_takers == TRUE & never_takers == FALSE) { ### no never-takers: E[D=1 | Z=1,X] = 1 md_z1x <- rep(1, n) - - g_d_z0 <- rlassologit(d[indz0] ~ x[indz0, , drop = FALSE], - post = post, intercept = intercept, penalty = penalty) + + g_d_z0 <- rlassologit(d[indz0] ~ x[indz0, , drop = FALSE], + post = post, intercept = intercept, penalty = penalty + ) md_z0x <- predict(g_d_z0, newdata = x) } - + if (always_takers == FALSE & never_takers == FALSE) { md_z1x <- rep(1, n) md_z0x <- rep(0, n) - - message("If there are no always-takers and no never-takers, ATE is estimated") + + message("If there are no always-takers and no never-takers, ATE is estimated") } } - + # E[Z|X] = mz_x b_z_xL <- rlassologit(z ~ x, post = post, intercept = intercept) mz_x <- predict(b_z_xL, newdata = x) - mz_x <- mz_x * (mz_x > 1e-12 & mz_x < 1 - 1e-12) + (1 - 1e-12) * (mz_x > - 1 - 1e-12) + 1e-12 * (mz_x < 1e-12) - - eff <- (z * (y - my_z1x)/mz_x - ((1 - z) * (y - my_z0x)/(1 - mz_x)) + - my_z1x - my_z0x)/mean(z * (d - md_z1x)/mz_x - ((1 - z) * (d - md_z0x)/(1 - - mz_x)) + md_z1x - md_z0x) - - se <- sqrt(var(eff))/sqrt(n) + mz_x <- mz_x * (mz_x > 1e-12 & mz_x < 1 - 1e-12) + (1 - 1e-12) * (mz_x > + 1 - 1e-12) + 1e-12 * (mz_x < 1e-12) + + eff <- (z * (y - my_z1x) / mz_x - ((1 - z) * (y - my_z0x) / (1 - mz_x)) + + my_z1x - my_z0x) / mean(z * (d - md_z1x) / mz_x - ((1 - z) * (d - md_z0x) / (1 - + mz_x)) + md_z1x - md_z0x) + + se <- sqrt(var(eff)) / sqrt(n) late <- mean(eff) individual <- eff - - object <- list(se = se, te = late, individual = individual, type = "LATE", - call = match.call(), samplesize = n) - + + object <- list( + se = se, te = late, individual = individual, type = "LATE", + call = match.call(), samplesize = n + ) + if (bootstrap != "none") { boot <- rep(NA, nRep) for (i in 1:nRep) { @@ -215,13 +231,13 @@ rlassoLATE.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post weights <- rnorm(n) } if (bootstrap == "wild") { - weights <- rnorm(n)/sqrt(2) + (rnorm(n)^2 - 1)/2 + weights <- rnorm(n) / sqrt(2) + (rnorm(n)^2 - 1) / 2 } weights <- weights + 1 - boot[i] <- mean(weights * (z * (y - my_z1x)/mz_x - ((1 - z) * - (y - my_z0x)/(1 - mz_x)) + my_z1x - my_z0x))/mean(weights * - (z * (d - md_z1x)/mz_x - ((1 - z) * (d - md_z0x)/(1 - mz_x)) + - md_z1x - md_z0x)) + boot[i] <- mean(weights * (z * (y - my_z1x) / mz_x - ((1 - z) * + (y - my_z0x) / (1 - mz_x)) + my_z1x - my_z0x)) / mean(weights * + (z * (d - md_z1x) / mz_x - ((1 - z) * (d - md_z0x) / (1 - mz_x)) + + md_z1x - md_z0x)) } object$boot.se <- sqrt(var(boot)) object$type_boot <- bootstrap @@ -236,10 +252,10 @@ rlassoLATE.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post #' @export # #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, # #' d endogenous variable, z instrumental variables, and x exogenous variables. -# #' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +# #' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. 
# #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoLATE} is called. - rlassoLATE.formula <- function(formula, data, bootstrap = "none", nRep = 500, post = TRUE, intercept = TRUE, - always_takers = TRUE, never_takers = TRUE,...) { +rlassoLATE.formula <- function(formula, data, bootstrap = "none", nRep = 500, post = TRUE, intercept = TRUE, + always_takers = TRUE, never_takers = TRUE, ...) { mat <- f.formula(formula, data, all.categories = FALSE, ...) y <- mat$Y x <- mat$X @@ -247,87 +263,98 @@ rlassoLATE.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post z <- mat$Z check_binary(d) check_binary(z) - res <- rlassoLATE(x=x, d=d, y=y, z=z, bootstrap = bootstrap, nRep = nRep, post = post, - intercept = intercept, always_takers = always_takers, never_takers = never_takers) + res <- rlassoLATE( + x = x, d = d, y = y, z = z, bootstrap = bootstrap, nRep = nRep, post = post, + intercept = intercept, always_takers = always_takers, never_takers = never_takers + ) res$call <- match.call() return(res) } #' @export #' @rdname TE -rlassoLATET <- function(x, ...) - UseMethod("rlassoLATET") # definition generic function - +rlassoLATET <- function(x, ...) { + UseMethod("rlassoLATET") +} # definition generic function + #' @export #' @rdname TE -rlassoLATET.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post = TRUE, - intercept = TRUE, always_takers = TRUE, ...) { +rlassoLATET.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post = TRUE, + intercept = TRUE, always_takers = TRUE, ...) { x <- as.matrix(x) n <- dim(x)[1] p <- dim(x)[2] checkmate::checkChoice(bootstrap, c("none", "Bayes", "normal", "wild")) - lambda <- 2.2 * sqrt(n) * qnorm(1 - (.1/log(n))/(2 * (2 * p))) + lambda <- 2.2 * sqrt(n) * qnorm(1 - (.1 / log(n)) / (2 * (2 * p))) control <- list(numIter = 15, tol = 10^-5) # penalty <- list(method = 'none', lambda.start = rep(lambda, p), c = # 1.1, gamma = 0.1) - penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, - p), c = 1.1, gamma = 0.1) + penalty <- list(homoscedastic = "none", lambda.start = rep( + lambda, + p + ), c = 1.1, gamma = 0.1) indz1 <- (z == 1) indz0 <- (z == 0) # E[Y|Z = 0,X] = my_z0x - b_y_z0xL <- rlasso(y[indz0] ~ x[indz0, ], post = post, intercept = intercept, - control = control, penalty = penalty) + b_y_z0xL <- rlasso(y[indz0] ~ x[indz0, ], + post = post, intercept = intercept, + control = control, penalty = penalty + ) my_z0x <- predict(b_y_z0xL, newdata = x) - + # E[D|Z = 0,X] = md_z0x - #penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, + # penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, # p), c = 1.1, gamma = 0.1) - #b_d_z0x <- rlassologit(d[indz0] ~ x[indz0, ], post = post, intercept = intercept, penalty = penalty) - #md_z0x <- predict(b_d_z0x, newdata = x) - - #if (sum(d - z) == 0) { - if (identical(d,z)) { + # b_d_z0x <- rlassologit(d[indz0] ~ x[indz0, ], post = post, intercept = intercept, penalty = penalty) + # md_z0x <- predict(b_d_z0x, newdata = x) + + # if (sum(d - z) == 0) { + if (identical(d, z)) { # md_z1x <- rep(1, n) md_z0x <- rep(0, n) } - - else { - if (always_takers == TRUE) { - g_d_z0 <- rlassologit(d[indz0] ~ x[indz0, , drop = FALSE], - post = post, intercept = intercept, penalty = penalty) - md_z0x <- predict(g_d_z0, newdata = x) - } - - if (always_takers == FALSE){ - md_z0x <- rep(0, n) - } - + + else { + if (always_takers == TRUE) { + g_d_z0 <- rlassologit(d[indz0] ~ x[indz0, , drop = 
FALSE], + post = post, intercept = intercept, penalty = penalty + ) + md_z0x <- predict(g_d_z0, newdata = x) + } + + if (always_takers == FALSE) { + md_z0x <- rep(0, n) + } } - + # E[Z|X] = mz_x - lambdaP <- 2.2 * sqrt(n) * qnorm(1 - (.1/log(n))/(2 * p)) - # penalty <- list(lambda.start = lambdaP, c = 1.1, gamma = 0.1) - #penalty <- list(homoscedastic = "none", lambda.start = p, c = 1.1, + lambdaP <- 2.2 * sqrt(n) * qnorm(1 - (.1 / log(n)) / (2 * p)) + # penalty <- list(lambda.start = lambdaP, c = 1.1, gamma = 0.1) + # penalty <- list(homoscedastic = "none", lambda.start = p, c = 1.1, # gamma = 0.1) - penalty <- list(homoscedastic = "none", lambda.start = rep(lambda, - p), c = 1.1, gamma = 0.1) + penalty <- list(homoscedastic = "none", lambda.start = rep( + lambda, + p + ), c = 1.1, gamma = 0.1) b_z_xL <- rlassologit(z ~ x, post = post, intercept = intercept, penalty = penalty) mz_x <- predict(b_z_xL, newdata = x) - mz_x <- mz_x * (mz_x > 1e-12 & mz_x < 1 - 1e-12) + (1 - 1e-12) * (mz_x > - 1 - 1e-12) + 1e-12 * (mz_x < 1e-12) - - - ## - eff <- ((y - my_z0x) - (1 - z) * (y - my_z0x)/(1 - mz_x))/mean((d - - md_z0x) - (1 - z) * (d - md_z0x)/(1 - mz_x)) - - se <- sqrt(var(eff))/sqrt(n) + mz_x <- mz_x * (mz_x > 1e-12 & mz_x < 1 - 1e-12) + (1 - 1e-12) * (mz_x > + 1 - 1e-12) + 1e-12 * (mz_x < 1e-12) + + + ## + eff <- ((y - my_z0x) - (1 - z) * (y - my_z0x) / (1 - mz_x)) / mean((d - + md_z0x) - (1 - z) * (d - md_z0x) / (1 - mz_x)) + + se <- sqrt(var(eff)) / sqrt(n) latet <- mean(eff) individual <- eff - - object <- list(se = se, te = latet, individual = individual, type = "LATET", - call = match.call(), samplesize = n) - + + object <- list( + se = se, te = latet, individual = individual, type = "LATET", + call = match.call(), samplesize = n + ) + if (bootstrap != "none") { boot <- rep(NA, nRep) for (i in 1:nRep) { @@ -338,16 +365,16 @@ rlassoLATET.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post weights <- rnorm(n) } if (bootstrap == "wild") { - weights <- rnorm(n)/sqrt(2) + (rnorm(n)^2 - 1)/2 + weights <- rnorm(n) / sqrt(2) + (rnorm(n)^2 - 1) / 2 } weights <- weights + 1 # boot[i] <- mean(weights*(z*(y-my_z1x)/mz_x - # ((1-z)*(y-my_z0x)/(1-mz_x)) + my_z1x - my_z0x))/ # mean(weights*(z*(d-md_z1x)/mz_x - ((1-z)*(d-md_z0x)/(1-mz_x)) + # md_z1x - md_z0x)) - boot[i] <- mean(weights * ((y - my_z0x) - (1 - z) * (y - my_z0x)/(1 - - mz_x)))/mean(weights * ((d - md_z0x) - (1 - z) * (d - md_z0x)/(1 - - mz_x))) + boot[i] <- mean(weights * ((y - my_z0x) - (1 - z) * (y - my_z0x) / (1 - + mz_x))) / mean(weights * ((d - md_z0x) - (1 - z) * (d - md_z0x) / (1 - + mz_x))) } object$boot.se <- sqrt(var(boot)) object$type_boot <- bootstrap @@ -361,10 +388,10 @@ rlassoLATET.default <- function(x, d, y, z, bootstrap = "none", nRep = 500, post #' @export # #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, # #' d endogenous variable, z instrumental variables, and x exogenous variables. -# #' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +# #' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. # #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoLATE} is called. 
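For orientation, a minimal usage sketch of the LATE/LATET estimators restyled above, using only calls that appear in this file (rlassoLATE, rlassoLATET and the summary/confint methods for the returned objects). The simulated design, seed, dimensions, and variable names below are illustrative assumptions and are not taken from the package or the patch.

library(hdm)
set.seed(42)
n <- 200
p <- 20
x <- matrix(rnorm(n * p), ncol = p)
# binary instrument and a treatment with two-sided non-compliance,
# so both always-takers and never-takers are plausible
z <- rbinom(n, 1, plogis(x[, 1]))
d <- ifelse(z == 1, rbinom(n, 1, 0.9), rbinom(n, 1, 0.2))
y <- d + x[, 1] + rnorm(n)
# LATE with the defaults always_takers = TRUE, never_takers = TRUE
late <- rlassoLATE(x, d, y, z)
summary(late)
confint(late)
# LATET with wild bootstrap weights for the standard error
latet <- rlassoLATET(x, d, y, z, bootstrap = "wild", nRep = 100)
summary(latet)

The always_takers and never_takers switches matter when one compliance type can be ruled out a priori; if both are set to FALSE, the code above falls back to estimating the ATE, as the message in rlassoLATE.default indicates.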
-rlassoLATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, post = TRUE, - intercept = TRUE, always_takers = TRUE, ...) { +rlassoLATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, post = TRUE, + intercept = TRUE, always_takers = TRUE, ...) { mat <- f.formula(formula, data, all.categories = FALSE) y <- mat$Y x <- mat$X @@ -372,8 +399,10 @@ rlassoLATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, z <- mat$Z check_binary(d) check_binary(z) - res <- rlassoLATET(x=x, d=d, y=y, z=z, bootstrap = bootstrap, nRep = nRep, post = post, - intercept = intercept,always_takers = always_takers) + res <- rlassoLATET( + x = x, d = d, y = y, z = z, bootstrap = bootstrap, nRep = nRep, post = post, + intercept = intercept, always_takers = always_takers + ) res$call <- match.call() return(res) } @@ -396,16 +425,20 @@ rlassoLATET.formula <- function(formula, data, bootstrap = "none", nRep = 500, #' @aliases methods.rlassoTE print.rlassoTE summary.rlassoTE #' @export -print.rlassoTE <- function(x, digits = max(3L, getOption("digits") - 3L), +print.rlassoTE <- function(x, digits = max(3L, getOption("digits") - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(x$te)) { cat("Treatment Effect\n") cat(paste("Type:", x$type), "\n") cat("Value:\n") print.default(format(x$te, digits = digits), print.gap = 2L, quote = FALSE) - } else cat("No treatment effect\n") + } else { + cat("No treatment effect\n") + } cat("\n") invisible(x$te) } @@ -413,8 +446,8 @@ print.rlassoTE <- function(x, digits = max(3L, getOption("digits") - 3L), #' @rdname methods.rlassoTE #' @export -summary.rlassoTE <- function(object, digits = max(3L, getOption("digits") - - 3L), ...) { +summary.rlassoTE <- function(object, digits = max(3L, getOption("digits") - + 3L), ...) { if (length(object$te)) { table <- matrix(NA, ncol = 4, nrow = 1) rownames(table) <- "TE" @@ -425,13 +458,16 @@ summary.rlassoTE <- function(object, digits = max(3L, getOption("digits") - } else { table[, 2] <- object$boot.se } - table[, 3] <- table[, 1]/table[, 2] + table[, 3] <- table[, 1] / table[, 2] table[, 4] <- 2 * pnorm(-abs(table[, 3])) - cat("Estimation and significance testing of the treatment effect", - "\n") + cat( + "Estimation and significance testing of the treatment effect", + "\n" + ) cat(paste("Type:", object$type), "\n") - cat(paste("Bootstrap:", ifelse(is.null(object$type_boot), "not applicable", - object$type_boot)), "\n") + cat(paste("Bootstrap:", ifelse(is.null(object$type_boot), "not applicable", + object$type_boot + )), "\n") printCoefmat(table, digits = digits, P.values = TRUE, has.Pvalue = TRUE) cat("\n") } else { @@ -449,12 +485,14 @@ confint.rlassoTE <- function(object, parm, level = 0.95, ...) 
{ k <- 1 cf <- object$te pnames <- "TE" - a <- (1 - level)/2 + a <- (1 - level) / 2 a <- c(a, 1 - a) fac <- qt(a, n - k) pct <- format.perc(a, 3) - ci <- array(NA, dim = c(length(pnames), 2L), dimnames = list(pnames, - pct)) + ci <- array(NA, dim = c(length(pnames), 2L), dimnames = list( + pnames, + pct + )) if (is.null(object$type_boot)) { ses <- object$se } else { diff --git a/R/tsls.R b/R/tsls.R index d6b6a85..a1ca484 100644 --- a/R/tsls.R +++ b/R/tsls.R @@ -4,7 +4,7 @@ #' #' The function computes tsls estimate (coefficients) and variance-covariance-matrix assuming homoskedasticity #' for outcome variable \code{y} where \code{d} are endogenous variables in structural equation, \code{x} are exogensous variables in -#' structural equation and z are instruments. It returns an object of class \code{tsls} for which the methods \code{print} and \code{summary} +#' structural equation and z are instruments. It returns an object of class \code{tsls} for which the methods \code{print} and \code{summary} #' are provided. #' #' @param y outcome variable @@ -19,31 +19,32 @@ #' \item{se}{standard error} #' @rdname tsls #' @export -tsls <- function(x, ...) - UseMethod("tsls") # definition generic function +tsls <- function(x, ...) { + UseMethod("tsls") +} # definition generic function #' @rdname tsls #' @export -tsls.default <- function(x, d, y, z, intercept=TRUE, homoscedastic=TRUE, ...) { +tsls.default <- function(x, d, y, z, intercept = TRUE, homoscedastic = TRUE, ...) { n <- length(y) - + d <- as.matrix(d) - if (!is.null(x)) x <- as.matrix(x) + if (!is.null(x)) x <- as.matrix(x) z <- as.matrix(z) - if (is.null(colnames(d)) & is.matrix(d)) colnames(d) <- paste("d", 1:ncol(d), sep="") - if (is.null(colnames(x)) & !is.null(x) & is.matrix(x)) colnames(x) <- paste("x", 1:ncol(x), sep="") - if (is.null(colnames(z)) & !is.null(z) & is.matrix(z)) colnames(z) <- paste("z", 1:ncol(z), sep="") - - if (intercept==TRUE && is.null(x)) { - x <- as.matrix(rep(1,n)) + if (is.null(colnames(d)) & is.matrix(d)) colnames(d) <- paste("d", 1:ncol(d), sep = "") + if (is.null(colnames(x)) & !is.null(x) & is.matrix(x)) colnames(x) <- paste("x", 1:ncol(x), sep = "") + if (is.null(colnames(z)) & !is.null(z) & is.matrix(z)) colnames(z) <- paste("z", 1:ncol(z), sep = "") + + if (intercept == TRUE && is.null(x)) { + x <- as.matrix(rep(1, n)) colnames(x) <- "(Intercept)" } else { - if (intercept==TRUE) { - x <- as.matrix(cbind(1,x)) + if (intercept == TRUE) { + x <- as.matrix(cbind(1, x)) colnames(x)[1] <- "(Intercept)" } } - + a1 <- dim(d)[2] a2 <- dim(x)[2] if (is.null(x)) { @@ -61,29 +62,29 @@ tsls.default <- function(x, d, y, z, intercept=TRUE, homoscedastic=TRUE, ...) 
{ Mxz <- t(X) %*% Z Mzz <- solve(t(Z) %*% Z) - #Mzz <- MASS::ginv(t(Z) %*% Z) + # Mzz <- MASS::ginv(t(Z) %*% Z) M <- solve(Mxz %*% Mzz %*% t(Mxz)) - #M <- MASS::ginv(Mxz %*% Mzz %*% t(Mxz)) - + # M <- MASS::ginv(Mxz %*% Mzz %*% t(Mxz)) + b <- M %*% Mxz %*% Mzz %*% (t(Z) %*% y) - #Dhat <- Z %*% MASS::ginv(t(Z) %*% Z) %*% t(Z) %*% X - #b2 <- MASS::ginv(t(Dhat) %*% X) %*% (t(Dhat) %*% y) - if (homoscedastic==TRUE) { - e <- y - X %*% b - #VC1 <- as.numeric((t(e) %*% e/(n - k))) * M - VC1 <- as.numeric((sum(e^2)/(n - k))) * M + # Dhat <- Z %*% MASS::ginv(t(Z) %*% Z) %*% t(Z) %*% X + # b2 <- MASS::ginv(t(Dhat) %*% X) %*% (t(Dhat) %*% y) + if (homoscedastic == TRUE) { + e <- y - X %*% b + # VC1 <- as.numeric((t(e) %*% e/(n - k))) * M + VC1 <- as.numeric((sum(e^2) / (n - k))) * M } - if (homoscedastic==FALSE) { + if (homoscedastic == FALSE) { e <- y - X %*% b S <- 0 for (i in 1:n) { - S <- S + e[i]^2*(Z[i,]%*%t(Z[i,])) + S <- S + e[i]^2 * (Z[i, ] %*% t(Z[i, ])) } - S <- S*1/n - VC1 <- n*M%*%(Mxz%*%Mzz%*%S%*%Mzz%*%t(Mxz))%*%M + S <- S * 1 / n + VC1 <- n * M %*% (Mxz %*% Mzz %*% S %*% Mzz %*% t(Mxz)) %*% M } rownames(b) <- colnames(VC1) <- rownames(VC1) <- c(colnames(d), colnames(x)) - res <- list(coefficients = b, vcov = VC1, se=sqrt(diag(VC1)), residuals = e, call=match.call(), samplesize=n) + res <- list(coefficients = b, vcov = VC1, se = sqrt(diag(VC1)), residuals = e, call = match.call(), samplesize = n) class(res) <- "tsls" return(res) } @@ -93,16 +94,16 @@ tsls.default <- function(x, d, y, z, intercept=TRUE, homoscedastic=TRUE, ...) { #' @export #' @param formula An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, #' d endogenous variable, z instrumental variables, and x exogenous variables. -#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +#' @param data An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. #' If not found in data, the variables are taken from environment(formula), typically the environment from which \code{tsls} is called. -tsls.formula <- function(formula, data, intercept=TRUE, homoscedastic=TRUE, ...) { - if (missing(data)) data <- environment(formula) +tsls.formula <- function(formula, data, intercept = TRUE, homoscedastic = TRUE, ...) { + if (missing(data)) data <- environment(formula) mat <- f.formula(formula, data, all.categories = FALSE) y <- mat$Y x <- mat$X d <- mat$D z <- mat$Z - res <- tsls(y=y, d=d, x=x, z=z, intercept=intercept, homoscedastic=homoscedastic) + res <- tsls(y = y, d = d, x = x, z = z, intercept = intercept, homoscedastic = homoscedastic) res$call <- match.call() return(res) } @@ -110,7 +111,7 @@ tsls.formula <- function(formula, data, intercept=TRUE, homoscedastic=TRUE, ...) ################# Methods for tsls #' Methods for S3 object \code{tsls} #' -#' Objects of class \code{tsls} are constructed by \code{tsls}. +#' Objects of class \code{tsls} are constructed by \code{tsls}. #' \code{print.tsls} prints and displays some information about fitted \code{tsls} objects. #' \code{summary.tsls} summarizes information of a fitted \code{tsls} object. #' @param x an object of class \code{tsls} @@ -120,17 +121,23 @@ tsls.formula <- function(formula, data, intercept=TRUE, homoscedastic=TRUE, ...) 
#' @aliases methods.tsls print.tsls summary.tsls #' @export -print.tsls <- function(x, digits = max(3L, getOption("digits") - - 3L), ...) { - cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), - "\n\n", sep = "") +print.tsls <- function(x, digits = max(3L, getOption("digits") - + 3L), ...) { + cat("\nCall:\n", paste(deparse(x$call), sep = "\n", collapse = "\n"), + "\n\n", + sep = "" + ) if (length(coef(x))) { coeffs <- as.matrix(coef(x)) colnames(coeffs) <- "Estimate" cat("Coefficients:\n") - print.default(format(coeffs, digits = digits), print.gap = 2L, - quote = FALSE) - } else cat("No coefficients\n") + print.default(format(coeffs, digits = digits), + print.gap = 2L, + quote = FALSE + ) + } else { + cat("No coefficients\n") + } cat("\n") invisible(coef(x)) } @@ -139,16 +146,16 @@ print.tsls <- function(x, digits = max(3L, getOption("digits") - #' @rdname methods.tsls #' @export -summary.tsls <- function(object, digits = max(3L, getOption("digits") - - 3L), ...) { +summary.tsls <- function(object, digits = max(3L, getOption("digits") - + 3L), ...) { if (length(coef(object))) { k <- length(object$coefficient) table <- matrix(NA, ncol = 4, nrow = k) - rownames(table) <- dimnames(object$coefficients)[[1]] #names(object$coefficient) + rownames(table) <- dimnames(object$coefficients)[[1]] # names(object$coefficient) colnames(table) <- c("Estimate", "Std. Error", "t value", "p value") table[, 1] <- object$coefficient table[, 2] <- sqrt(diag(as.matrix(object$vcov))) - table[, 3] <- table[, 1]/table[, 2] + table[, 3] <- table[, 1] / table[, 2] table[, 4] <- 2 * pnorm(-abs(table[, 3])) print("Estimates and Significance Testing from from tsls") printCoefmat(table, digits = digits, P.values = TRUE, has.Pvalue = TRUE) @@ -158,4 +165,4 @@ summary.tsls <- function(object, digits = max(3L, getOption("digits") - } cat("\n") invisible(table) -} \ No newline at end of file +} diff --git a/man/AJR.Rd b/man/AJR.Rd index 69158a3..dd337fb 100644 --- a/man/AJR.Rd +++ b/man/AJR.Rd @@ -5,7 +5,7 @@ \alias{AJR} \title{AJR data set} \format{ -\describe{ +\describe{ \item{Mort}{Settler mortality} \item{logMort}{logarithm of Mort} \item{Latitude}{Latitude} @@ -16,7 +16,7 @@ \item{Samer}{South America} \item{Neo}{Neo-Europes} \item{GDP}{GDP} -\item{Exprop}{Average protection against expropriation risk} +\item{Exprop}{Average protection against expropriation risk} } } \description{ diff --git a/man/BLP.Rd b/man/BLP.Rd index 46a9603..be9d18a 100644 --- a/man/BLP.Rd +++ b/man/BLP.Rd @@ -5,7 +5,7 @@ \alias{BLP} \title{BLP data set} \format{ -\describe{ +\describe{ \item{model.name}{model name} \item{model.id}{model id} \item{firm.id}{firm id} @@ -27,7 +27,7 @@ Automobile data set from the US. } \details{ -Data set was analysed in Berry, Levinsohn and Pakes (1995). The data stem from annual issues of the Automotive News Market Data Book. +Data set was analysed in Berry, Levinsohn and Pakes (1995). The data stem from annual issues of the Automotive News Market Data Book. The data set inlcudes information on all models marketed during the the period beginning 1971 and ending in 1990 cotaining 2217 model/years from 997 distinct models. A detailed description is given in BLP (1995, 868--871). The internal function \code{constructIV} constructs instrumental variables along the lines described and used in BLP (1995). 
} diff --git a/man/EminentDomain.Rd b/man/EminentDomain.Rd index 226cbd1..6f897ec 100644 --- a/man/EminentDomain.Rd +++ b/man/EminentDomain.Rd @@ -5,7 +5,7 @@ \alias{EminentDomain} \title{Eminent Domain data set} \format{ -\describe{ +\describe{ \item{y}{economic outcome variable} \item{x}{set of exogenous variables} \item{d}{eminent domain decisions} @@ -16,14 +16,14 @@ Dataset on judicial eminent domain decisions. } \details{ -Data set was analyzed in Belloni et al. (2012). They estimate the effect of judicial eminent domain decisions on economic outcomes with instrumental variables (IV) in a setting high a large set of potential IVs. -A detailed decription of the data can be found at -\url{https://www.econometricsociety.org/publications/econometrica/2012/11/01/sparse-models-and-methods-optimal-instruments-application} -The data set contains four "sub-data sets" which differ mainly in the dependent variables: repeat-sales FHFA/OFHEO house price index for metro (FHFA) and non-metro (NM) area, the Case-Shiller home price index (CS), +Data set was analyzed in Belloni et al. (2012). They estimate the effect of judicial eminent domain decisions on economic outcomes with instrumental variables (IV) in a setting high a large set of potential IVs. +A detailed decription of the data can be found at +\url{https://www.econometricsociety.org/publications/econometrica/2012/11/01/sparse-models-and-methods-optimal-instruments-application} +The data set contains four "sub-data sets" which differ mainly in the dependent variables: repeat-sales FHFA/OFHEO house price index for metro (FHFA) and non-metro (NM) area, the Case-Shiller home price index (CS), and state-level GDP from the Bureau of Economic Analysis - all transformed with the logarithm. The structure of each subdata set is given above. In the data set the following variables and name conventions are used: -"numpanelskx_..." is the number of panels with at least k members with the characteristic following the "_". -The probability controls (names start with "F_prob_") follow a similar naming convention and give the probability of observing a panel with characteristic given following second "_" given the characteristics of the pool of judges available to be assigned to the case. +"numpanelskx_..." is the number of panels with at least k members with the characteristic following the "_". +The probability controls (names start with "F_prob_") follow a similar naming convention and give the probability of observing a panel with characteristic given following second "_" given the characteristics of the pool of judges available to be assigned to the case. 
Characteristics in the data for the control variables or instruments: \describe{ diff --git a/man/RMD_stable.Rd b/man/RMD_stable.Rd new file mode 100644 index 0000000..bb95574 --- /dev/null +++ b/man/RMD_stable.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rlassoAutoDML_helpers.R +\name{RMD_stable} +\alias{RMD_stable} +\title{RMD_stable} +\usage{ +RMD_stable( + Y, + D, + X, + p0, + D_LB, + D_add, + max_iter, + b, + is_alpha, + is_lasso, + c = 0.5, + alpha = 0.1, + tol = 1e-06 +) +} +\arguments{ +\item{Y}{} + +\item{D}{} + +\item{X}{} + +\item{p0}{} + +\item{D_LB}{} + +\item{D_add}{} + +\item{max_iter}{} + +\item{b}{} + +\item{is_alpha}{} + +\item{is_lasso}{} + +\item{tol}{} +} +\description{ +TODO: insert description on what this function does and document all +parameters +} +\details{ +TODO: set defaults for arguments +} diff --git a/man/TE.Rd b/man/TE.Rd index a5b9f13..cb5fc73 100644 --- a/man/TE.Rd +++ b/man/TE.Rd @@ -106,7 +106,7 @@ rlassoLATET(x, ...) \item{formula}{An object of class \code{Formula} of the form " y ~ x + d | x" with y the outcome variable, d treatment variable, and x exogenous variables.} -\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoATE} is called.} \item{z}{instrumental variables (binary)} diff --git a/man/coef.rlassoEffects.Rd b/man/coef.rlassoEffects.Rd index ed85aa9..0f6e338 100644 --- a/man/coef.rlassoEffects.Rd +++ b/man/coef.rlassoEffects.Rd @@ -17,10 +17,10 @@ \item{complete}{general option of the function \code{coef}.} -\item{selection.matrix}{if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. +\item{selection.matrix}{if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. Default is set to FALSE.} -\item{include.targets}{if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, +\item{include.targets}{if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, the selection matrix will also indicate the selection of the target coefficients that are specified in the \code{rlassoEffects} call.} \item{...}{further arguments passed to functions coef or print.} @@ -39,20 +39,22 @@ Printing coefficients and selection matrix for S3 object \code{rlassoEffects}. 
I \examples{ library(hdm) set.seed(1) -n = 100 #sample size -p = 100 # number of variables -s = 7 # number of non-zero variables -X = matrix(rnorm(n*p), ncol=p) -colnames(X) <- paste("X", 1:p, sep="") -beta = c(rep(3,s), rep(0,p-s)) -y = 1 + X\%*\%beta + rnorm(n) -data = data.frame(cbind(y,X)) +n <- 100 # sample size +p <- 100 # number of variables +s <- 7 # number of non-zero variables +X <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("X", 1:p, sep = "") +beta <- c(rep(3, s), rep(0, p - s)) +y <- 1 + X \%*\% beta + rnorm(n) +data <- data.frame(cbind(y, X)) colnames(data)[1] <- "y" -lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50), - method = "double selection") +lasso.effect <- rlassoEffects(X, y, + index = c(1, 2, 3, 50), + method = "double selection" +) coef(lasso.effect) # standard use of coef() - without selection matrix # with selection matrix coef(lasso.effect, selection.matrix = TRUE) # prettier output with print_coef (identical options as coef()) -print_coef(lasso.effect, selection.matrix = TRUE) +print_coef(lasso.effect, selection.matrix = TRUE) } diff --git a/man/coef.rlassoIV.Rd b/man/coef.rlassoIV.Rd index 99de465..2abf9fd 100644 --- a/man/coef.rlassoIV.Rd +++ b/man/coef.rlassoIV.Rd @@ -17,7 +17,7 @@ Default is set to FALSE. See section on details for more information.} \item{...}{further arguments passed to function coef.} } \value{ -Coefficients obtained from \code{rlassoIV} by default. If option \code{selection.matrix} is \code{TRUE}, a list is returned with final coefficients, a matrix \code{selection.matrix}, and a matrix \code{selection.matrixZ}: +Coefficients obtained from \code{rlassoIV} by default. If option \code{selection.matrix} is \code{TRUE}, a list is returned with final coefficients, a matrix \code{selection.matrix}, and a matrix \code{selection.matrixZ}: \code{selection.matrix} contains the selection index for the lasso regression of \code{y} on \code{x} (first column) and the lasso regression of the predicted values of \code{d} on \code{x} together with the union of these indizes. \code{selection.matrixZ} contains the selection index from the first-stage lasso regression of \code{d} on \code{z} and \code{x}. @@ -27,7 +27,7 @@ Method to extract coefficients from objects of class \code{rlassoIV}. } \details{ Printing coefficients and selection matrix for S3 object \code{rlassoIV}. \code{"x"} indicates that a variable has been selected, i.e., the corresponding estimated coefficient is different from zero. -The very last column collects all variables that have been selected in at least one of the lasso regressions represented in the \code{selection.matrix}. +The very last column collects all variables that have been selected in at least one of the lasso regressions represented in the \code{selection.matrix}. \code{rlassoIV} performs three lasso regression steps. A first stage lasso regression of the endogenous treatment variable \code{d} on the instruments \code{z} and exogenous covariates \code{x}, a lasso regression of \code{y} on the exogenous variables \code{x}, and a lasso regression of the instrumented treatment variable, i.e., a regression of the predicted values of \code{d}, on controls \code{x}. 
} @@ -38,7 +38,7 @@ z <- EminentDomain$logGDP$z # instruments x <- EminentDomain$logGDP$x # exogenous variables y <- EminentDomain$logGDP$y # outcome varialbe d <- EminentDomain$logGDP$d # treatment / endogenous variable -lasso.IV = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=TRUE) +lasso.IV <- rlassoIV(x = x, d = d, y = y, z = z, select.X = TRUE, select.Z = TRUE) coef(lasso.IV) # default behavior coef(lasso.IV, selection.matrix = T) # print selection matrix } diff --git a/man/coef.rlassoIVselectX.Rd b/man/coef.rlassoIVselectX.Rd index 650b1ab..cbbc82e 100644 --- a/man/coef.rlassoIVselectX.Rd +++ b/man/coef.rlassoIVselectX.Rd @@ -7,7 +7,7 @@ \method{coef}{rlassoIVselectX}(object, complete = TRUE, selection.matrix = FALSE, ...) } \arguments{ -\item{object}{an object of class \code{rlassoIVselectX}, usually a result of a call +\item{object}{an object of class \code{rlassoIVselectX}, usually a result of a call \code{rlassoIVselectX} or \code{rlassoIV} with options \code{select.X=TRUE} and \code{select.Z=FALSE}.} @@ -30,12 +30,16 @@ the index of selected variables \code{x} for the instruments \code{z}. The very \examples{ \dontrun{ library(hdm) -data(AJR); y = AJR$GDP; d = AJR$Exprop; z = AJR$logMort -x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + - Asia + Namer + Samer)^2, data=AJR) -AJR.Xselect = rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | - logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, - data=AJR, select.X=TRUE, select.Z=FALSE) +data(AJR) +y <- AJR$GDP +d <- AJR$Exprop +z <- AJR$logMort +x <- model.matrix(~ -1 + (Latitude + Latitude2 + Africa + + Asia + Namer + Samer)^2, data = AJR) +AJR.Xselect <- rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | + logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, +data = AJR, select.X = TRUE, select.Z = FALSE +) coef(AJR.Xselect) # Default behavior coef(AJR.Xselect, selection.matrix = TRUE) # print selection matrix } diff --git a/man/coef.rlassoIVselectZ.Rd b/man/coef.rlassoIVselectZ.Rd index 9a6b701..425b4ec 100644 --- a/man/coef.rlassoIVselectZ.Rd +++ b/man/coef.rlassoIVselectZ.Rd @@ -7,7 +7,7 @@ \method{coef}{rlassoIVselectZ}(object, complete = TRUE, selection.matrix = FALSE, ...) 
} \arguments{ -\item{object}{an object of class \code{rlassoIVselectZ}, usually a result of a call +\item{object}{an object of class \code{rlassoIVselectZ}, usually a result of a call \code{rlassoIVselectZ} or \code{rlassoIV} with options \code{select.X=FALSE} and \code{select.Z=TRUE}.} @@ -28,13 +28,13 @@ The very last column collects all variables that have been selected in at least } \examples{ \dontrun{ -lasso.IV.Z = rlassoIVselectZ(x=x, d=d, y=y, z=z) +lasso.IV.Z <- rlassoIVselectZ(x = x, d = d, y = y, z = z) data(EminentDomain) z <- EminentDomain$logGDP$z # instruments x <- EminentDomain$logGDP$x # exogenous variables y <- EminentDomain$logGDP$y # outcome varialbe d <- EminentDomain$logGDP$d # treatment / endogenous variable -lasso.IV.Z = rlassoIVselectZ(x=x, d=d, y=y, z=z) +lasso.IV.Z <- rlassoIVselectZ(x = x, d = d, y = y, z = z) coef(lasso.IV.Z) # Default behavior coef(lasso.IV.Z, selection.matrix = T) } diff --git a/man/cps2012.Rd b/man/cps2012.Rd index bbe51d1..87f3b68 100644 --- a/man/cps2012.Rd +++ b/man/cps2012.Rd @@ -5,7 +5,7 @@ \alias{cps2012} \title{cps2012 data set} \format{ -\describe{ +\describe{ \item{lnw}{log of hourly wage (annual earnings / annual hours)} \item{female}{female indicator} \item{married status}{ six indicators: widowed, divorced, separated, nevermarried, and married (omitted)} @@ -20,9 +20,9 @@ Census data from the US for the year 2012. } \details{ -The CPS is a monthly U.S. household survey conducted jointly by the U.S. Census Bureau and the Bureau of Labor Statistics. The data comprise the year 2012. -This data set was used in Mulligan and Rubinstein (2008). -The sample comprises white non-hipanic, ages 25-54, working full time full year (35+ hours per week at least 50 weeks), exclude living in group quarters, +The CPS is a monthly U.S. household survey conducted jointly by the U.S. Census Bureau and the Bureau of Labor Statistics. The data comprise the year 2012. +This data set was used in Mulligan and Rubinstein (2008). +The sample comprises white non-Hispanics, ages 25-54, working full time full year (35+ hours per week at least 50 weeks), excluding those living in group quarters, self-employed, military, agricultural, and private household sector, allocated earning, inconsistent report on earnings and employment, missing data. } \examples{ diff --git a/man/hdm-package.Rd b/man/hdm-package.Rd index aa17c4d..a5295b6 100644 --- a/man/hdm-package.Rd +++ b/man/hdm-package.Rd @@ -10,10 +10,10 @@ This package implements methods for estimation and inference in a high-dimension } \details{ \tabular{ll}{ Package: \tab hdm\cr Type: \tab Package\cr Version: \tab -0.1\cr Date: \tab 2015-05-25\cr License: \tab GPL-3\cr } This package provides efficient estimators -and uniformly valid confidence intervals for various low-dimensional causal/structural parameters +0.1\cr Date: \tab 2015-05-25\cr License: \tab GPL-3\cr } This package provides efficient estimators +and uniformly valid confidence intervals for various low-dimensional causal/structural parameters appearing in high-dimensional approximately sparse models. The package -includes functions for fitting heteroskedastic robust Lasso regressions with non-Gaussian erros and +includes functions for fitting heteroskedastic robust Lasso regressions with non-Gaussian errors and for instrumental variable (IV) and treatment effect estimation in a high-dimensional setting. Moreover, the methods enable valid post-selection inference. Moreover, a theoretically grounded, data-driven choice of the penalty level is provided.
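As a quick, self-contained sketch of the data-driven penalty choice mentioned above: the calls below use simulated data together with the penalty options documented for rlasso (c, gamma, homoscedastic, X.dependent.lambda, lambda.start); the specific values are illustrative, not recommended settings.

library(hdm)
set.seed(1)
n <- 100
p <- 50
X <- matrix(rnorm(n * p), ncol = p)
y <- X[, 1:3] %*% rep(3, 3) + rnorm(n)
# default: heteroscedastic, X-independent penalty with c = 1.1 and gamma = 0.1
fit.default <- rlasso(X, y, penalty = list(
  homoscedastic = FALSE, X.dependent.lambda = FALSE,
  lambda.start = NULL, c = 1.1, gamma = 0.1
))
# X-dependent penalty under homoscedasticity, approximated by simulation
fit.xdep <- rlasso(X, y, penalty = list(homoscedastic = TRUE, X.dependent.lambda = TRUE, numSim = 4000))
# user-supplied penalty level: homoscedastic = "none" requires lambda.start
fit.fixed <- rlasso(X, y, penalty = list(homoscedastic = "none", X.dependent.lambda = FALSE, lambda.start = 100))
summary(fit.default, all = FALSE)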
diff --git a/man/lambdaCalculation.Rd b/man/lambdaCalculation.Rd index 436de7f..9903d62 100644 --- a/man/lambdaCalculation.Rd +++ b/man/lambdaCalculation.Rd @@ -12,7 +12,7 @@ lambdaCalculation( ) } \arguments{ -\item{penalty}{list with options for the calculation of the penalty. +\item{penalty}{list with options for the calculation of the penalty. \itemize{ \item{\code{c} and \code{gamma}}{ constants for the penalty with default \code{c=1.1} and \code{gamma=0.1}} \item{\code{homoscedastic}}{ logical, if homoscedastic errors are considered (default \code{FALSE}). Option \code{none} is described below.} diff --git a/man/methods.rlassoIV.Rd b/man/methods.rlassoIV.Rd index 4f1c6f6..35c5a5b 100644 --- a/man/methods.rlassoIV.Rd +++ b/man/methods.rlassoIV.Rd @@ -27,7 +27,7 @@ \item{level}{confidence level required.} } \description{ -Objects of class \code{rlassoIV} are constructed by \code{rlassoIV}. +Objects of class \code{rlassoIV} are constructed by \code{rlassoIV}. \code{print.rlassoIV} prints and displays some information about fitted \code{rlassoIV} objects. \code{summary.rlassoIV} summarizes information of a fitted \code{rlassoIV} object. \code{confint.rlassoIV} extracts the confidence intervals. diff --git a/man/methods.rlassoIVselectX.Rd b/man/methods.rlassoIVselectX.Rd index a70bdba..a081e40 100644 --- a/man/methods.rlassoIVselectX.Rd +++ b/man/methods.rlassoIVselectX.Rd @@ -27,7 +27,7 @@ \item{level}{the confidence level required.} } \description{ -Objects of class \code{rlassoIVselectX} are constructed by \code{rlassoIVselectX}. +Objects of class \code{rlassoIVselectX} are constructed by \code{rlassoIVselectX}. \code{print.rlassoIVselectX} prints and displays some information about fitted \code{rlassoIVselectX} objects. \code{summary.rlassoIVselectX} summarizes information of a fitted \code{rlassoIVselectX} object. \code{confint.rlassoIVselectX} extracts the confidence intervals. diff --git a/man/methods.rlassoIVselectZ.Rd b/man/methods.rlassoIVselectZ.Rd index 5133627..ed5f49a 100644 --- a/man/methods.rlassoIVselectZ.Rd +++ b/man/methods.rlassoIVselectZ.Rd @@ -27,7 +27,7 @@ \item{level}{confidence level required.} } \description{ -Objects of class \code{rlassoIVselectZ} are constructed by \code{rlassoIVselectZ}. +Objects of class \code{rlassoIVselectZ} are constructed by \code{rlassoIVselectZ}. \code{print.rlassoIVselectZ} prints and displays some information about fitted \code{rlassoIVselectZ} objects. \code{summary.rlassoIVselectZ} summarizes information of a fitted \code{rlassoIVselectZ} object. \code{confint.rlassoIVselectZ} extracts the confidence intervals. diff --git a/man/methods.rlassologitEffects.Rd b/man/methods.rlassologitEffects.Rd index b6a31c4..8387c7c 100644 --- a/man/methods.rlassologitEffects.Rd +++ b/man/methods.rlassologitEffects.Rd @@ -29,7 +29,7 @@ \item{joint}{logical, if joint confidence intervals should be clalculated} } \description{ -Objects of class \code{rlassologitEffects} are construced by \code{rlassologitEffects} or \code{rlassologitEffect}. +Objects of class \code{rlassologitEffects} are constructed by \code{rlassologitEffects} or \code{rlassologitEffect}. \code{print.rlassologitEffects} prints and displays some information about fitted \code{rlassologitEffect} objects. \code{summary.rlassologitEffects} summarizes information of a fitted \code{rlassologitEffects} object. \code{confint.rlassologitEffects} extracts the confidence intervals.
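The print, summary, and confint methods listed here are typically chained after a single fit; the short sketch below reuses the simulated logistic design from the rlassologitEffects examples further down, and the joint = TRUE call is meant as an illustration of the joint confidence intervals described above.

library(hdm)
set.seed(2)
n <- 250
p <- 100
px <- 10
X <- matrix(rnorm(n * p), ncol = p)
colnames(X) <- paste("V", 1:p, sep = "")
beta <- c(rep(2, px), rep(0, p - px))
P <- exp(1 + X %*% beta) / (1 + exp(1 + X %*% beta))
y <- rbinom(n, size = 1, prob = P)
# inference on the coefficients of V1, V2 and V40
logit.effects <- rlassologitEffects(X, y, index = c(1, 2, 40))
print(logit.effects)
summary(logit.effects)
confint(logit.effects)               # marginal confidence intervals
confint(logit.effects, joint = TRUE) # joint confidence intervals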
diff --git a/man/methods.tsls.Rd b/man/methods.tsls.Rd index c88a740..2fd17cf 100644 --- a/man/methods.tsls.Rd +++ b/man/methods.tsls.Rd @@ -20,7 +20,7 @@ \item{object}{an object of class \code{tsls}} } \description{ -Objects of class \code{tsls} are constructed by \code{tsls}. +Objects of class \code{tsls} are constructed by \code{tsls}. \code{print.tsls} prints and displays some information about fitted \code{tsls} objects. \code{summary.tsls} summarizes information of a fitted \code{tsls} object. } diff --git a/man/p_adjust.Rd b/man/p_adjust.Rd index 435fef7..482342f 100644 --- a/man/p_adjust.Rd +++ b/man/p_adjust.Rd @@ -56,23 +56,23 @@ Objects of class \code{rlassoEffects} are constructed by }} \examples{ -library(hdm); +library(hdm) set.seed(1) -n = 100 #sample size -p = 25 # number of variables -s = 3 # nubmer of non-zero variables -X = matrix(rnorm(n*p), ncol=p) -colnames(X) <- paste("X", 1:p, sep="") -beta = c(rep(3,s), rep(0,p-s)) -y = 1 + X\%*\%beta + rnorm(n) -data = data.frame(cbind(y,X)) +n <- 100 # sample size +p <- 25 # number of variables +s <- 3 # nubmer of non-zero variables +X <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("X", 1:p, sep = "") +beta <- c(rep(3, s), rep(0, p - s)) +y <- 1 + X \%*\% beta + rnorm(n) +data <- data.frame(cbind(y, X)) colnames(data)[1] <- "y" -lasso.effect = rlassoEffects(X, y, index=c(1:20)) -pvals.lasso.effect = p_adjust(lasso.effect, method = "RW", B = 1000) -ols = lm(y ~ -1 + X, data) -pvals.ols = p_adjust(ols, method = "RW", B = 1000) -pvals.ols = p_adjust(ols, method = "RW", B = 1000, test.index = c(1,2,5)) -pvals.ols = p_adjust(ols, method = "RW", B = 1000, test.index = c(rep(TRUE, 5), rep(FALSE, p-5))) +lasso.effect <- rlassoEffects(X, y, index = c(1:20)) +pvals.lasso.effect <- p_adjust(lasso.effect, method = "RW", B = 1000) +ols <- lm(y ~ -1 + X, data) +pvals.ols <- p_adjust(ols, method = "RW", B = 1000) +pvals.ols <- p_adjust(ols, method = "RW", B = 1000, test.index = c(1, 2, 5)) +pvals.ols <- p_adjust(ols, method = "RW", B = 1000, test.index = c(rep(TRUE, 5), rep(FALSE, p - 5))) } \references{ J.P. Romano, M. Wolf (2005). Exact and approximate stepdown diff --git a/man/print_coef.Rd b/man/print_coef.Rd index 862f5de..194e5e4 100644 --- a/man/print_coef.Rd +++ b/man/print_coef.Rd @@ -22,10 +22,10 @@ print_coef(x, ...) \item{complete}{general option of the function \code{coef}.} -\item{selection.matrix}{if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. +\item{selection.matrix}{if TRUE, a selection matrix is returned that indicates the selected variables from each auxiliary regression. Default is set to FALSE.} -\item{include.targets}{if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. If set to TRUE, +\item{include.targets}{if FALSE (by default) only the selected control variables are listed in the \code{selection.matrix}. 
If set to TRUE, the selection matrix will also indicate the selection of the target coefficients that are specified in the \code{rlassoEffects} call.} } \description{ @@ -37,19 +37,21 @@ Printing coefficients and selection matrix for S3 object \code{rlassoEffects} \examples{ library(hdm) set.seed(1) -n = 100 #sample size -p = 100 # number of variables -s = 7 # number of non-zero variables -X = matrix(rnorm(n*p), ncol=p) -colnames(X) <- paste("X", 1:p, sep="") -beta = c(rep(3,s), rep(0,p-s)) -y = 1 + X\%*\%beta + rnorm(n) -data = data.frame(cbind(y,X)) +n <- 100 # sample size +p <- 100 # number of variables +s <- 7 # number of non-zero variables +X <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("X", 1:p, sep = "") +beta <- c(rep(3, s), rep(0, p - s)) +y <- 1 + X \%*\% beta + rnorm(n) +data <- data.frame(cbind(y, X)) colnames(data)[1] <- "y" -lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50), - method = "double selection") +lasso.effect <- rlassoEffects(X, y, + index = c(1, 2, 3, 50), + method = "double selection" +) # without target coefficient estimates -print_coef(lasso.effect, selection.matrix = TRUE) +print_coef(lasso.effect, selection.matrix = TRUE) # with target coefficient estimates -print_coef(lasso.effect, selection.matrix = TRUE, targets = TRUE) +print_coef(lasso.effect, selection.matrix = TRUE, targets = TRUE) } diff --git a/man/rlasso.Rd b/man/rlasso.Rd index 87a0ea2..7b466cd 100644 --- a/man/rlasso.Rd +++ b/man/rlasso.Rd @@ -67,7 +67,7 @@ penalized.} \item{model}{logical. If \code{TRUE} (default), model matrix is returned.} -\item{penalty}{list with options for the calculation of the penalty. +\item{penalty}{list with options for the calculation of the penalty. \itemize{ \item{\code{c} and \code{gamma}}{ constants for the penalty with default \code{c=1.1} and \code{gamma=0.1}} \item{\code{homoscedastic}}{ logical, if homoscedastic errors are considered (default \code{FALSE}). Option \code{none} is described below.} @@ -110,7 +110,7 @@ returned is of the S3 class \code{rlasso}. The function estimates the coefficients of a Lasso regression with data-driven penalty under homoscedasticity / heteroscedasticity and non-Gaussian noise. The options \code{homoscedastic} is a logical with \code{FALSE} by default. Moreover, for the calculation of the penalty parameter it can be chosen, if the penalization parameter depends on the design matrix (\code{X.dependent.lambda=TRUE}) or \code{independent} (default, \code{X.dependent.lambda=FALSE}). -The default value of the constant \code{c} is \code{1.1} in the post-Lasso case and \code{0.5} in the Lasso case. +The default value of the constant \code{c} is \code{1.1} in the post-Lasso case and \code{0.5} in the Lasso case. A \emph{special} option is to set \code{homoscedastic} to \code{none} and to supply a values \code{lambda.start}. Then this value is used as penalty parameter with independent design and heteroscedastic errors to weight the regressors. For details of the implementation of the Algorithm for estimation of the data-driven penalty, @@ -118,25 +118,25 @@ in particular the regressor-independent loadings, we refer to Appendix A in Belloni et al. (2012). When the option "none" is chosen for \code{homoscedastic} (together with \code{lambda.start}), lambda is set to \code{lambda.start} and the regressor-independent loadings und heteroscedasticity are used. The options "X-dependent" and -"X-independent" under homoscedasticity are described in Belloni et al. (2013). 
+"X-independent" under homoscedasticity are described in Belloni et al. (2013). The option \code{post=TRUE} conducts post-lasso estimation, i.e. a refit of the model with the selected variables. } \examples{ set.seed(1) -n = 100 #sample size -p = 100 # number of variables -s = 3 # nubmer of variables with non-zero coefficients -X = Xnames = matrix(rnorm(n*p), ncol=p) -colnames(Xnames) <- paste("V", 1:p, sep="") -beta = c(rep(5,s), rep(0,p-s)) -Y = X\%*\%beta + rnorm(n) -reg.lasso <- rlasso(Y~Xnames) -Xnew = matrix(rnorm(n*p), ncol=p) # new X -colnames(Xnew) <- paste("V", 1:p, sep="") -Ynew = Xnew\%*\%beta + rnorm(n) #new Y -yhat = predict(reg.lasso, newdata = Xnew) +n <- 100 # sample size +p <- 100 # number of variables +s <- 3 # nubmer of variables with non-zero coefficients +X <- Xnames <- matrix(rnorm(n * p), ncol = p) +colnames(Xnames) <- paste("V", 1:p, sep = "") +beta <- c(rep(5, s), rep(0, p - s)) +Y <- X \%*\% beta + rnorm(n) +reg.lasso <- rlasso(Y ~ Xnames) +Xnew <- matrix(rnorm(n * p), ncol = p) # new X +colnames(Xnew) <- paste("V", 1:p, sep = "") +Ynew <- Xnew \%*\% beta + rnorm(n) # new Y +yhat <- predict(reg.lasso, newdata = Xnew) } \references{ A. Belloni, D. Chen, V. Chernozhukov and C. Hansen (2012). diff --git a/man/rlassoDML.Rd b/man/rlassoDML.Rd new file mode 100644 index 0000000..4fd1666 --- /dev/null +++ b/man/rlassoDML.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rlassoAutoDML.R +\name{rlassoAutoDML} +\alias{rlassoAutoDML} +\title{Auto DML based on rlasso} +\usage{ +rlassoAutoDML( + Y, + D, + X, + dict = NULL, + D_LB = 0, + D_add = 0.2, + bias = FALSE, + L = 5, + max_iter = 10 +) +} +\arguments{ +\item{Y}{A vector of outputs} + +\item{D}{A vector of treatment values} + +\item{X}{A matrix of covariates} + +\item{dict}{A dictionary +TODO: explain what the dictionary does and state what default does} + +\item{D_LB}{TODO: documentation for this parameter (including default)} + +\item{D_add}{TODO: documentation for this parameter (including default)} + +\item{bias}{debiased vs. biased results +TODO: do users really need the biased results or are they just used for +illustration purposes} + +\item{L}{TODO: documentation for this parameter (including default)} + +\item{max_iter}{TODO: documentation for this parameter (including default)} +} +\value{ +list with average treatment effect and standard error +} +\description{ +TODO: +insert a meaningful description and include references from the literature. +} +\examples{ +# TODO include a quick and independent example (including data loading/generation) +# rlassoAutoDML(Y, T, X, dict) +} diff --git a/man/rlassoEffects.Rd b/man/rlassoEffects.Rd index 6446eac..4262640 100644 --- a/man/rlassoEffects.Rd +++ b/man/rlassoEffects.Rd @@ -52,7 +52,7 @@ In the case of partialling out it is ignored.} \item{formula}{An element of class \code{formula} specifying the linear model.} -\item{data}{an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. 
If not found in data, the variables are taken from environment(formula), typically the environment from which the function is called.} \item{I}{An one-sided formula specifying the variables for which inference is conducted.} @@ -78,29 +78,44 @@ the outcome variable on the control variables and the treatment variable on the control variables. The final estimation is done by a regression of the outcome on the treatment effect and the union of the selected variables in the first two steps. In partialling-out first the effect of the regressors on the outcome and the treatment variable is taken out by Lasso and then a regression of the residuals is conducted. The resulting estimator for \eqn{\alpha_0} is normal -distributed which allows inference on the treatment effect. It presents a wrap function for \code{rlassoEffect} +distributed which allows inference on the treatment effect. It presents a wrap function for \code{rlassoEffect} which does inference for a single variable. } \examples{ -library(hdm); library(ggplot2) +library(hdm) +library(ggplot2) set.seed(1) -n = 100 #sample size -p = 100 # number of variables -s = 3 # number of non-zero variables -X = matrix(rnorm(n*p), ncol=p) -colnames(X) <- paste("X", 1:p, sep="") -beta = c(rep(3,s), rep(0,p-s)) -y = 1 + X\%*\%beta + rnorm(n) -data = data.frame(cbind(y,X)) +n <- 100 # sample size +p <- 100 # number of variables +s <- 3 # number of non-zero variables +X <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("X", 1:p, sep = "") +beta <- c(rep(3, s), rep(0, p - s)) +y <- 1 + X \%*\% beta + rnorm(n) +data <- data.frame(cbind(y, X)) colnames(data)[1] <- "y" -fm = paste("y ~", paste(colnames(X), collapse="+")) -fm = as.formula(fm) -lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50)) -lasso.effect = rlassoEffects(fm, I = ~ X1 + X2 + X3 + X50, data=data) +fm <- paste("y ~", paste(colnames(X), collapse = "+")) +fm <- as.formula(fm) +lasso.effect <- rlassoEffects(X, y, index = c(1, 2, 3, 50)) +lasso.effect <- rlassoEffects(fm, I = ~ X1 + X2 + X3 + X50, data = data) print(lasso.effect) summary(lasso.effect) confint(lasso.effect) plot(lasso.effect) +# library(hdm) +# ## DGP +# n <- 250 +# p <- 100 +# px <- 10 +# X <- matrix(rnorm(n*p), ncol=p) +# beta <- c(rep(2,px), rep(0,p-px)) +# intercept <- 1 +# y <- intercept + X \%*\% beta + rnorm(n) +# ## fit rlassoEffects object with inference on three variables +# rlassoEffects.reg <- rlassoEffects(x=X, y=y, index=c(1,7,20)) +# ## methods +# summary(rlassoEffects.reg) +# confint(rlassoEffects.reg, level=0.9) } \references{ A. Belloni, V. Chernozhukov, C. Hansen (2014). Inference on diff --git a/man/rlassoIV.Rd b/man/rlassoIV.Rd index c595f11..81038e5 100644 --- a/man/rlassoIV.Rd +++ b/man/rlassoIV.Rd @@ -36,7 +36,7 @@ rlassoIVmult(x, d, y, z, select.Z = TRUE, select.X = TRUE, ...) \item{formula}{An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, d endogenous variable, z instrumental variables, and x exogenous variables.} -\item{data}{an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoIV} is called.} } \value{ @@ -52,8 +52,8 @@ controls and very many instruments (even larger than the sample size). 
The implementation for selection on x and z follows the procedure described in Chernozhukov et al. (2015) and is built on 'triple selection' to achieve an orthogonal moment function. The function returns an object of S3 class \code{rlassoIV}. -Moreover, it is wrap function for the case that selection should be done only with the instruments Z (\code{rlassoIVselectZ}) or with -the control variables X (\code{rlassoIVselectX}) or without selection (\code{tsls}). Exogenous variables +Moreover, it is a wrapper function for the case that selection should be done only with the instruments Z (\code{rlassoIVselectZ}) or with +the control variables X (\code{rlassoIVselectX}) or without selection (\code{tsls}). Exogenous variables \code{x} are automatically used as instruments and added to the instrument set \code{z}. } @@ -64,7 +64,7 @@ z <- EminentDomain$logGDP$z # instruments x <- EminentDomain$logGDP$x # exogenous variables y <- EminentDomain$logGDP$y # outcome varialbe d <- EminentDomain$logGDP$d # treatment / endogenous variable -lasso.IV.Z = rlassoIV(x=x, d=d, y=y, z=z, select.X=FALSE, select.Z=TRUE) +lasso.IV.Z <- rlassoIV(x = x, d = d, y = y, z = z, select.X = FALSE, select.Z = TRUE) summary(lasso.IV.Z) confint(lasso.IV.Z) } diff --git a/man/rlassoIVselectX.Rd b/man/rlassoIVselectX.Rd index a4cf1b0..468ff00 100644 --- a/man/rlassoIVselectX.Rd +++ b/man/rlassoIVselectX.Rd @@ -28,7 +28,7 @@ rlassoIVselectX(x, ...) \item{formula}{An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, d endogenous variable, z instrumental variables, and x exogenous variables.} -\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoIVselectX} is called.} } \value{ @@ -45,20 +45,24 @@ The function returns an element of class \code{rlassoIVselectX} \details{ The implementation is a special case of of Chernozhukov et al. (2015). The option \code{post=TRUE} conducts post-lasso estimation for the Lasso estimations, i.e. a refit of the -model with the selected variables. Exogenous variables +model with the selected variables. Exogenous variables \code{x} are automatically used as instruments and added to the instrument set \code{z}.
} \examples{ library(hdm) -data(AJR); y = AJR$GDP; d = AJR$Exprop; z = AJR$logMort -x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa + - Asia + Namer + Samer)^2, data=AJR) +data(AJR) +y <- AJR$GDP +d <- AJR$Exprop +z <- AJR$logMort +x <- model.matrix(~ -1 + (Latitude + Latitude2 + Africa + + Asia + Namer + Samer)^2, data = AJR) dim(x) - #AJR.Xselect = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=FALSE) - AJR.Xselect = rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | - logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, - data=AJR, select.X=TRUE, select.Z=FALSE) +# AJR.Xselect = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=FALSE) +AJR.Xselect <- rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2 | + logMort + (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2, +data = AJR, select.X = TRUE, select.Z = FALSE +) summary(AJR.Xselect) confint(AJR.Xselect) } diff --git a/man/rlassoIVselectZ.Rd b/man/rlassoIVselectZ.Rd index 1b777d3..ba8b4cd 100644 --- a/man/rlassoIVselectZ.Rd +++ b/man/rlassoIVselectZ.Rd @@ -31,7 +31,7 @@ Exogenous variables serve as their own instruments.} \item{formula}{An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, d endogenous variable, z instrumental variables, and x exogenous variables.} -\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. If not found in data, the variables are taken from environment(formula), typically the environment from which \code{rlassoIVselectZ} is called.} } \value{ diff --git a/man/rlassologit.Rd b/man/rlassologit.Rd index 61cf5ee..3c2ebc2 100644 --- a/man/rlassologit.Rd +++ b/man/rlassologit.Rd @@ -99,19 +99,19 @@ set.seed(2) n <- 250 p <- 100 px <- 10 -X <- matrix(rnorm(n*p), ncol=p) -beta <- c(rep(2,px), rep(0,p-px)) +X <- matrix(rnorm(n * p), ncol = p) +beta <- c(rep(2, px), rep(0, p - px)) intercept <- 1 -P <- exp(intercept + X \%*\% beta)/(1+exp(intercept + X \%*\% beta)) -y <- rbinom(length(y), size=1, prob=P) +P <- exp(intercept + X \%*\% beta) / (1 + exp(intercept + X \%*\% beta)) +y <- rbinom(length(y), size = 1, prob = P) ## fit rlassologit object -rlassologit.reg <- rlassologit(y~X) +rlassologit.reg <- rlassologit(y ~ X) ## methods -summary(rlassologit.reg, all=F) +summary(rlassologit.reg, all = F) print(rlassologit.reg) -predict(rlassologit.reg, type='response') -X3 <- matrix(rnorm(n*p), ncol=p) -predict(rlassologit.reg, newdata=X3) +predict(rlassologit.reg, type = "response") +X3 <- matrix(rnorm(n * p), ncol = p) +predict(rlassologit.reg, newdata = X3) } } \references{ diff --git a/man/rlassologitEffects.Rd b/man/rlassologitEffects.Rd index 3db703e..bb9aea9 100644 --- a/man/rlassologitEffects.Rd +++ b/man/rlassologitEffects.Rd @@ -33,7 +33,7 @@ indicates if variables (TRUE) should be included in any case.} \item{formula}{An element of class \code{formula} specifying the linear model.} -\item{data}{an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{an optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. 
If not found in data, the variables are taken from environment(formula), typically the environment from which the function is called.} \item{I}{An one-sided formula specifying the variables for which inference is conducted.} @@ -63,16 +63,16 @@ set.seed(2) n <- 250 p <- 100 px <- 10 -X <- matrix(rnorm(n*p), ncol=p) -colnames(X) = paste("V", 1:p, sep="") -beta <- c(rep(2,px), rep(0,p-px)) +X <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("V", 1:p, sep = "") +beta <- c(rep(2, px), rep(0, p - px)) intercept <- 1 -P <- exp(intercept + X \%*\% beta)/(1+exp(intercept + X \%*\% beta)) -y <- rbinom(n, size=1, prob=P) -xd <- X[,2:50] -d <- X[,1] -logit.effect <- rlassologitEffect(x=xd, d=d, y=y) -logit.effects <- rlassologitEffects(X,y, index=c(1,2,40)) +P <- exp(intercept + X \%*\% beta) / (1 + exp(intercept + X \%*\% beta)) +y <- rbinom(n, size = 1, prob = P) +xd <- X[, 2:50] +d <- X[, 1] +logit.effect <- rlassologitEffect(x = xd, d = d, y = y) +logit.effects <- rlassologitEffects(X, y, index = c(1, 2, 40)) logit.effects.f <- rlassologitEffects(y ~ X, I = ~ V1 + V2) } } diff --git a/man/tsls.Rd b/man/tsls.Rd index 5adcf05..febe964 100644 --- a/man/tsls.Rd +++ b/man/tsls.Rd @@ -30,7 +30,7 @@ tsls(x, ...) \item{formula}{An object of class \code{Formula} of the form " y ~ x + d | x + z" with y the outcome variable, d endogenous variable, z instrumental variables, and x exogenous variables.} -\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. +\item{data}{An optional data frame, list or environment (or object coercible by as.data.frame to a data frame) containing the variables in the model. If not found in data, the variables are taken from environment(formula), typically the environment from which \code{tsls} is called.} } \value{ @@ -44,6 +44,6 @@ The function does Two-Stage Least Squares Estimation (TSLS). \details{ The function computes tsls estimate (coefficients) and variance-covariance-matrix assuming homoskedasticity for outcome variable \code{y} where \code{d} are endogenous variables in structural equation, \code{x} are exogensous variables in -structural equation and z are instruments. It returns an object of class \code{tsls} for which the methods \code{print} and \code{summary} +structural equation and z are instruments. It returns an object of class \code{tsls} for which the methods \code{print} and \code{summary} are provided. 
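A minimal sketch of the TSLS interface just described, using a small simulated design of the same shape as the package's formula tests; the coefficient values in the data-generating step are arbitrary and only serve to give d a first stage driven by z.

library(hdm)
set.seed(12345)
n <- 100
p <- 4
X <- matrix(rnorm(n * p), ncol = p)              # exogenous variables
Z <- matrix(rnorm(n * p), ncol = p)              # instruments
colnames(X) <- paste("X", 1:p, sep = "")
colnames(Z) <- paste("Z", 1:p, sep = "")
d <- as.vector(Z %*% rep(1, p)) + rnorm(n)       # endogenous variable with a first stage in Z
Y <- as.vector(d + X %*% rep(0.5, p)) + rnorm(n) # outcome
fit.tsls <- tsls(x = X, d = d, y = Y, z = Z)
print(fit.tsls)
summary(fit.tsls)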
} diff --git a/tests/testthat.R b/tests/testthat.R index 705ba55..765dfde 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,3 +1,3 @@ library(testthat) library(hdm) -#test_check("hdm") +# test_check("hdm") diff --git a/tests/testthat/DGPs.R b/tests/testthat/DGPs.R index b60c581..ec279df 100644 --- a/tests/testthat/DGPs.R +++ b/tests/testthat/DGPs.R @@ -1,81 +1,81 @@ DGP.IV <- -function(n=250, p=100, pnz=50, Fstat=180, control=list(s2e=1, Cev=.6, s2z=1, szz=.5, pi1=1, alpha=1)) { - s2e <- control$s2e - Cev <- control$Cev - s2z <- control$s2z - szz <- control$szz - pi1 <- control$pi1 - alpha <- control$alpha - indp <- 0:(p-1) - - SZ <- s2z*toeplitz((szz)^indp) - cSZ <- chol(SZ) - cFS <- (c(rep(1,pnz), rep(0,p-pnz))*pi1)^indp - scale <- matrix(0, nrow=1, ncol=1) - s2v <- matrix(0, nrow=1, ncol=1) - scale<- sqrt(Fstat/((Fstat+n)*t(cFS)%*%SZ%*%cFS)) - s2v <- 1-(scale^2)*t(cFS)%*%SZ%*%cFS - sev <- Cev*sqrt(s2e)*sqrt(s2v) - SU <- matrix(c(s2e,sev,sev,s2v), ncol=2) - cSU <- chol(SU) - - zorig <- matrix(rnorm(n*p), nrow=n,ncol=p)%*%cSZ - U <- matrix(rnorm(n*2), ncol=2)%*%cSU - xorig <- scale[1,1]*zorig%*%cFS+U[,2] - yorig <- alpha*xorig+U[,1] - Z <- zorig - colMeans(zorig) #rep(1,n)*mean(zorig) - X <- xorig - mean(xorig) - Y <- yorig - mean(yorig) - return(list(X=X, y=Y, Z=Z, setup=list(alpha=alpha, Pi=cFS, scale=scale, s2v=s2v, sev=sev, SU=SU))) -} + function(n = 250, p = 100, pnz = 50, Fstat = 180, control = list(s2e = 1, Cev = .6, s2z = 1, szz = .5, pi1 = 1, alpha = 1)) { + s2e <- control$s2e + Cev <- control$Cev + s2z <- control$s2z + szz <- control$szz + pi1 <- control$pi1 + alpha <- control$alpha + indp <- 0:(p - 1) + + SZ <- s2z * toeplitz((szz)^indp) + cSZ <- chol(SZ) + cFS <- (c(rep(1, pnz), rep(0, p - pnz)) * pi1)^indp + scale <- matrix(0, nrow = 1, ncol = 1) + s2v <- matrix(0, nrow = 1, ncol = 1) + scale <- sqrt(Fstat / ((Fstat + n) * t(cFS) %*% SZ %*% cFS)) + s2v <- 1 - (scale^2) * t(cFS) %*% SZ %*% cFS + sev <- Cev * sqrt(s2e) * sqrt(s2v) + SU <- matrix(c(s2e, sev, sev, s2v), ncol = 2) + cSU <- chol(SU) + + zorig <- matrix(rnorm(n * p), nrow = n, ncol = p) %*% cSZ + U <- matrix(rnorm(n * 2), ncol = 2) %*% cSU + xorig <- scale[1, 1] * zorig %*% cFS + U[, 2] + yorig <- alpha * xorig + U[, 1] + Z <- zorig - colMeans(zorig) # rep(1,n)*mean(zorig) + X <- xorig - mean(xorig) + Y <- yorig - mean(yorig) + return(list(X = X, y = Y, Z = Z, setup = list(alpha = alpha, Pi = cFS, scale = scale, s2v = s2v, sev = sev, SU = SU))) + } -DGP.HC <- function(n=250, p=100, alpha=0.5, design=1, R2=c(0.5,0.5)) { - Sigma <- toeplitz(0.5^(0:(p-1))) +DGP.HC <- function(n = 250, p = 100, alpha = 0.5, design = 1, R2 = c(0.5, 0.5)) { + Sigma <- toeplitz(0.5^(0:(p - 1))) C <- chol(Sigma) - X <- matrix(rnorm(n*p), nrow=n,ncol=p)%*%C - beta01 <- 1/(1:p)^2 - beta02 <- 1/(1:p)^2 - - if (design==1) { - sigma_d <- 1 - sigma_y <- 1 - } - if (design==2) { - sigma_d <- sqrt((1+X%*%beta01)^2/mean((1+X%*%beta01)^2)) - } - - cy <- sqrt(R2[1]/((1-R2[1])*t(as.matrix(alpha*beta02+ beta01))%*%Sigma%*%as.matrix(alpha*beta02 + beta01))) - beta01 <- cy * beta01 - cd <- sqrt(R2[2]/((1-R2[2])*t(as.matrix(beta02))%*%Sigma%*%as.matrix(beta02))) - beta02 <- cd * beta02 - d <- X%*%beta02 + sigma_d*rnorm(n) - - if (design==2) { - sigma_y <- sqrt((1+alpha*d+X%*%beta01)^2/mean((1+ alpha*d + X%*%beta01)^2)) + X <- matrix(rnorm(n * p), nrow = n, ncol = p) %*% C + beta01 <- 1 / (1:p)^2 + beta02 <- 1 / (1:p)^2 + + if (design == 1) { + sigma_d <- 1 + sigma_y <- 1 + } + if (design == 2) { + sigma_d <- sqrt((1 + X %*% beta01)^2 / mean((1 + X 
%*% beta01)^2)) } + cy <- sqrt(R2[1] / ((1 - R2[1]) * t(as.matrix(alpha * beta02 + beta01)) %*% Sigma %*% as.matrix(alpha * beta02 + beta01))) + beta01 <- cy * beta01 + cd <- sqrt(R2[2] / ((1 - R2[2]) * t(as.matrix(beta02)) %*% Sigma %*% as.matrix(beta02))) + beta02 <- cd * beta02 + d <- X %*% beta02 + sigma_d * rnorm(n) + + if (design == 2) { + sigma_y <- sqrt((1 + alpha * d + X %*% beta01)^2 / mean((1 + alpha * d + X %*% beta01)^2)) + } - y <- alpha*d+ X%*%beta01+ sigma_y*rnorm(n) - return(list(y=y,X=X,d=d)) + y <- alpha * d + X %*% beta01 + sigma_y * rnorm(n) + + return(list(y = y, X = X, d = d)) } -DGP.HCHIV <- function(n=250, px=300, pz=150, alpha=1) { - Sigma <- toeplitz(0.5^(0:(px-1))) +DGP.HCHIV <- function(n = 250, px = 300, pz = 150, alpha = 1) { + Sigma <- toeplitz(0.5^(0:(px - 1))) Sx <- chol(Sigma) - Se <- chol(matrix(c(1, .6, .6,1), ncol=2)) - - beta <- gamma <- 1/(1:px)^2 - delta <- 1/(1:pz)^2 - theta <- rbind(diag(pz), matrix(0, ncol=px-pz, nrow=pz)) - - X <- matrix(rnorm(n*px), nrow=n, ncol=px)%*%Sx - Z <- X%*%theta + 0.5*matrix(rnorm(n*pz), ncol=pz) - - e <- matrix(rnorm(n*2), ncol=2) - - d <- Z%*%delta + X%*%gamma + e[,1] - y <- d*alpha + X%*%beta + e[,2] - - return(list(y=y, d=d, X=X, Z=Z)) -} \ No newline at end of file + Se <- chol(matrix(c(1, .6, .6, 1), ncol = 2)) + + beta <- gamma <- 1 / (1:px)^2 + delta <- 1 / (1:pz)^2 + theta <- rbind(diag(pz), matrix(0, ncol = px - pz, nrow = pz)) + + X <- matrix(rnorm(n * px), nrow = n, ncol = px) %*% Sx + Z <- X %*% theta + 0.5 * matrix(rnorm(n * pz), ncol = pz) + + e <- matrix(rnorm(n * 2), ncol = 2) + + d <- Z %*% delta + X %*% gamma + e[, 1] + y <- d * alpha + X %*% beta + e[, 2] + + return(list(y = y, d = d, X = X, Z = Z)) +} diff --git a/tests/testthat/examples.R b/tests/testthat/examples.R index ec064b2..bd10a5a 100644 --- a/tests/testthat/examples.R +++ b/tests/testthat/examples.R @@ -14,7 +14,7 @@ #' } #' ## fit rlogisticlasso object #' rlogisticlasso.reg <- rlogisticlasso(x=X, y=y) -#' +#' #' ## methods #' summary(rlogisticlasso.reg, all=F) #' print(rlogisticlasso.reg) @@ -29,39 +29,37 @@ #' n <- 250 #' p <- 100 #' px <- 10 -#' X <- matrix(rnorm(n*p), ncol=p) -#' beta <- c(rep(2,px), rep(0,p-px)) +#' X <- matrix(rnorm(n * p), ncol = p) +#' beta <- c(rep(2, px), rep(0, p - px)) #' intercept <- 1 #' y <- intercept + X %*% beta + rnorm(n) #' ## fit rlassoLM object with inference on three variables -#' rlassoLM.reg <- rlassoLM(x=X, y=y, index=c(1,7,20)) +#' rlassoLM.reg <- rlassoLM(x = X, y = y, index = c(1, 7, 20)) #' ## methods #' summary(rlassoLM.reg) #' print(rlassoLM.reg) -#' confint(rlassoLM.reg, level=0.9) - - +#' confint(rlassoLM.reg, level = 0.9) set.seed(2) n <- 250 p <- 100 px <- 10 -X <- matrix(rnorm(n*p), ncol=p) -beta <- c(rep(2,px), rep(0,p-px)) +X <- matrix(rnorm(n * p), ncol = p) +beta <- c(rep(2, px), rep(0, p - px)) intercept <- 1 -P <- exp(intercept + X %*% beta)/(1+exp(intercept + X %*% beta)) -y <- numeric(length=250) -for(i in 1:n){ - y[i] <- sample(x=c(1,0), size=1, prob=c(P[i],1-P[i])) +P <- exp(intercept + X %*% beta) / (1 + exp(intercept + X %*% beta)) +y <- numeric(length = 250) +for (i in 1:n) { + y[i] <- sample(x = c(1, 0), size = 1, prob = c(P[i], 1 - P[i])) } ## fit rlogisticlasso object -rlogisticlasso.reg <- rlogisticlasso(x=X, y=y) +rlogisticlasso.reg <- rlogisticlasso(x = X, y = y) ## methods -summary(rlogisticlasso.reg, all=F) +summary(rlogisticlasso.reg, all = F) print(rlogisticlasso.reg) -head(predict(rlogisticlasso.reg, type="response")) -X3 <- matrix(rnorm(n*p), ncol=p) 
-head(predict(rlogisticlasso.reg, newdata=X3)) +head(predict(rlogisticlasso.reg, type = "response")) +X3 <- matrix(rnorm(n * p), ncol = p) +head(predict(rlogisticlasso.reg, newdata = X3)) @@ -71,19 +69,19 @@ set.seed(2) n <- 250 p <- 100 px <- 10 -X <- matrix(rnorm(n*p), ncol=p) -beta <- c(rep(2,px), rep(0,p-px)) +X <- matrix(rnorm(n * p), ncol = p) +beta <- c(rep(2, px), rep(0, p - px)) intercept <- 1 -P <- exp(intercept + X %*% beta)/(1+exp(intercept + X %*% beta)) -y <- numeric(length=250) -for(i in 1:n){ - y[i] <- sample(x=c(1,0), size=1, prob=c(P[i],1-P[i])) +P <- exp(intercept + X %*% beta) / (1 + exp(intercept + X %*% beta)) +y <- numeric(length = 250) +for (i in 1:n) { + y[i] <- sample(x = c(1, 0), size = 1, prob = c(P[i], 1 - P[i])) } - ## fit rlogisticlasso object - rlogisticlasso.reg <- rlogisticlasso(x=X, y=y) - ## methods -summary(rlogisticlasso.reg, all=F) +## fit rlogisticlasso object +rlogisticlasso.reg <- rlogisticlasso(x = X, y = y) +## methods +summary(rlogisticlasso.reg, all = F) print(rlogisticlasso.reg) -predict(rlogisticlasso.reg, type="response") -X3 <- matrix(rnorm(n*p), ncol=p) -predict(rlogisticlasso.reg, newdata=X3) +predict(rlogisticlasso.reg, type = "response") +X3 <- matrix(rnorm(n * p), ncol = p) +predict(rlogisticlasso.reg, newdata = X3) diff --git a/tests/testthat/formula_test.R b/tests/testthat/formula_test.R index 503d978..d2aeef8 100644 --- a/tests/testthat/formula_test.R +++ b/tests/testthat/formula_test.R @@ -1,49 +1,49 @@ -rm(list=ls()) +rm(list = ls()) set.seed(12345) n <- 100 p <- 4 Y <- rnorm(n) d <- rnorm(n) -X <- matrix(rnorm(n*p), ncol=p) -Z <- matrix(rnorm(n*p), ncol=p) -colnames(X) <- paste("X", 1:p, sep="") -colnames(Z) <- paste("Z", 1:p, sep="") -mat <- cbind(Y,d,X,Z) +X <- matrix(rnorm(n * p), ncol = p) +Z <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("X", 1:p, sep = "") +colnames(Z) <- paste("Z", 1:p, sep = "") +mat <- cbind(Y, d, X, Z) colnames(mat)[1:2] <- c("y", "d") dat <- as.data.frame(mat) -form <- as.Formula(Y ~ d + X1+ X2 + X3+ X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4) -dataf <- f.formula(Y ~ d + X1+ X2 + X3+ X4 | X1 + X2 + X3 + X4, data=dat) -dataf <- f.formula(Y ~ d + X1+ X2 + X3+ X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4, data=dat) +form <- as.Formula(Y ~ d + X1 + X2 + X3 + X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4) +dataf <- f.formula(Y ~ d + X1 + X2 + X3 + X4 | X1 + X2 + X3 + X4, data = dat) +dataf <- f.formula(Y ~ d + X1 + X2 + X3 + X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4, data = dat) dataf$Y dataf$D head(dataf$X) head(dataf$Z) ## test function include -i <- include(I = ~ X1+ X2, dat) -test1 <- rlassoEffects(Y ~ X1 + X2+ Z1 + Z2, I= ~ X1, data=dat) -#test2 <- rlassoEffects(Y ~ X1 + X2+ Z1 + Z2, I= ~ X1) -test3 <- rlassoEffects(y ~ X1 + X2+ Z1 + Z2, I = ~ X1, include= ~X1, data=dat) -test4 <- rlassoEffects(y ~ X1 + X2+ Z1 + Z2, I = ~ X1, include= ~X3, data=dat) -test5 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1, include= ~X3, data=dat) -test6 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1:X2, include= ~X3, data=dat) -test7 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1 + X2, include= ~X3, data=dat) -test8 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1 + X2, include= ~ NULL, data=dat) +i <- include(I = ~ X1 + X2, dat) +test1 <- rlassoEffects(Y ~ X1 + X2 + Z1 + Z2, I = ~X1, data = dat) +# test2 <- rlassoEffects(Y ~ X1 + X2+ Z1 + Z2, I= ~ X1) +test3 <- rlassoEffects(y ~ X1 + X2 + Z1 + Z2, I = ~X1, include = ~X1, data = dat) +test4 <- rlassoEffects(y ~ X1 + X2 + Z1 + Z2, I = ~X1, 
include = ~X3, data = dat) +test5 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~X1, include = ~X3, data = dat) +test6 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1:X2, include = ~X3, data = dat) +test7 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1 + X2, include = ~X3, data = dat) +test8 <- rlassoEffects(y ~ (X1 + X2)^2 + Z1 + Z2, I = ~ X1 + X2, include = ~NULL, data = dat) ############################ Test IV functions -test1a <- rlassoIVselectX(form, data=dat) -test1b <- rlassoIVselectX(x=X, z=Z, y=Y, d=d) -test2a <- rlassoIVselectZ(form, data=dat) -test2b <- rlassoIVselectZ(x=X, z=Z, y=Y, d=d) -#testold <- rlassoIVselectZ(x=X, d=d, y=Y, z=Z) -test3a <- rlassoIV(form, data=dat) -test3b <- rlassoIV(x=X, z=Z, y=Y, d=d) -test4a <- tsls(form, data=dat) -test4b <- tsls(x=X, z=Z, y=Y, d=d) +test1a <- rlassoIVselectX(form, data = dat) +test1b <- rlassoIVselectX(x = X, z = Z, y = Y, d = d) +test2a <- rlassoIVselectZ(form, data = dat) +test2b <- rlassoIVselectZ(x = X, z = Z, y = Y, d = d) +# testold <- rlassoIVselectZ(x=X, d=d, y=Y, z=Z) +test3a <- rlassoIV(form, data = dat) +test3b <- rlassoIV(x = X, z = Z, y = Y, d = d) +test4a <- tsls(form, data = dat) +test4b <- tsls(x = X, z = Z, y = Y, d = d) all.equal(test1a, test1b) all.equal(test2a, test2b) @@ -62,34 +62,34 @@ d1 <- rnorm(n) d2 <- rnorm(n) d <- cbind(d1, d2) colnames(d) <- c("d1", "d2") -X <- matrix(rnorm(n*p), ncol=p) -Z <- matrix(rnorm(n*p), ncol=p) -colnames(X) <- paste("X", 1:p, sep="") -colnames(Z) <- paste("Z", 1:p, sep="") -mat <- cbind(Y,d1, d2, X,Z) +X <- matrix(rnorm(n * p), ncol = p) +Z <- matrix(rnorm(n * p), ncol = p) +colnames(X) <- paste("X", 1:p, sep = "") +colnames(Z) <- paste("Z", 1:p, sep = "") +mat <- cbind(Y, d1, d2, X, Z) colnames(mat)[1:3] <- c("y", "d1", "d2") dat <- as.data.frame(mat) -form <- as.Formula(Y ~ d1 + d2 + X1+ X2 + X3+ X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4) +form <- as.Formula(Y ~ d1 + d2 + X1 + X2 + X3 + X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4) ######################################################################################### -dataf <- f.formula(Y ~ d1 + d2 + X1+ X2 + X3+ X4 | X1 + X2 + X3 + X4, data=dat) -dataf <- f.formula(Y ~ d1 + d2 + X1+ X2 + X3+ X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4, data=dat) +dataf <- f.formula(Y ~ d1 + d2 + X1 + X2 + X3 + X4 | X1 + X2 + X3 + X4, data = dat) +dataf <- f.formula(Y ~ d1 + d2 + X1 + X2 + X3 + X4 | Z1 + Z2 + Z3 + Z4 + X1 + X2 + X3 + X4, data = dat) dataf$Y dataf$D head(dataf$X) head(dataf$Z) ############################ Test IV functions -test1a <- rlassoIVselectX(form, data=dat) -test1b <- rlassoIVselectX(x=X, z=Z, y=Y, d=d) #dim(d)[2]==1 !!! -test2a <- rlassoIVselectZ(form, data=dat) -test2b <- rlassoIVselectZ(x=X, z=Z, y=Y, d=d) #dim(d)[2]>1 works in principle -test3a <- rlassoIV(form, data=dat) -test3b <- rlassoIV(x=X, z=Z, y=Y, d=d) #dim(d)[2]==1 !!! -test4a <- tsls(form, data=dat) -test4b <- tsls(x=X, z=Z, y=Y, d=d) #dim(d)[2]>1 works in principle +test1a <- rlassoIVselectX(form, data = dat) +test1b <- rlassoIVselectX(x = X, z = Z, y = Y, d = d) # dim(d)[2]==1 !!! +test2a <- rlassoIVselectZ(form, data = dat) +test2b <- rlassoIVselectZ(x = X, z = Z, y = Y, d = d) # dim(d)[2]>1 works in principle +test3a <- rlassoIV(form, data = dat) +test3b <- rlassoIV(x = X, z = Z, y = Y, d = d) # dim(d)[2]==1 !!! 
+test4a <- tsls(form, data = dat) +test4b <- tsls(x = X, z = Z, y = Y, d = d) # dim(d)[2]>1 works in principle all.equal(test1a, test1b) all.equal(test2a, test2b) @@ -106,24 +106,24 @@ set.seed(12345) n <- 100 p <- 4 Y <- rnorm(n) -d <- sample(c(0,1), size=n, replace=TRUE) -X <- matrix(rnorm(n*p), ncol=p) -Z <- sample(c(0,1), size=n, replace=TRUE) -colnames(X) <- paste("X", 1:p, sep="") -mat <- cbind(Y,d,Z,X) +d <- sample(c(0, 1), size = n, replace = TRUE) +X <- matrix(rnorm(n * p), ncol = p) +Z <- sample(c(0, 1), size = n, replace = TRUE) +colnames(X) <- paste("X", 1:p, sep = "") +mat <- cbind(Y, d, Z, X) colnames(mat)[1:3] <- c("y", "d", "Z") dat <- as.data.frame(mat) -form <- as.Formula(Y ~ d + X1+ X2 + X3+ X4 | Z + X1 + X2 + X3 + X4) -form2 <- as.Formula(Y ~ d + X1+ X2 + X3+ X4 | X1 + X2 + X3 + X4) -test1a <- rlassoATE(form2, data=dat) -test1b <- rlassoATE(x=X, y=Y, d=d) -test2a <- rlassoATET(form2, data=dat) -test2b <- rlassoATET(x=X, y=Y, d=d) -test3a <- rlassoLATE(form, data=dat) -test3b <- rlassoLATE(x=X, z=Z, y=Y, d=d) -test4a <- rlassoLATET(form, data=dat) -test4b <- rlassoLATET(x=X, z=Z, y=Y, d=d) +form <- as.Formula(Y ~ d + X1 + X2 + X3 + X4 | Z + X1 + X2 + X3 + X4) +form2 <- as.Formula(Y ~ d + X1 + X2 + X3 + X4 | X1 + X2 + X3 + X4) +test1a <- rlassoATE(form2, data = dat) +test1b <- rlassoATE(x = X, y = Y, d = d) +test2a <- rlassoATET(form2, data = dat) +test2b <- rlassoATET(x = X, y = Y, d = d) +test3a <- rlassoLATE(form, data = dat) +test3b <- rlassoLATE(x = X, z = Z, y = Y, d = d) +test4a <- rlassoLATET(form, data = dat) +test4b <- rlassoLATET(x = X, z = Z, y = Y, d = d) all.equal(test1a, test1b) all.equal(test2a, test2b) @@ -143,15 +143,15 @@ all.equal(test4a, test4b) ############################### Test treatment effects -form2 <- as.Formula(Y ~ d1 + d2 + X1+ X2 + X3+ X4 | X1 + X2 + X3 + X4) -test1a <- rlassoATE(form2, data=dat) -test1b <- rlassoATE(x=X, y=Y, d=d) -test2a <- rlassoATET(form2, data=dat) -test2b <- rlassoATET(x=X, y=Y, d=d) -test3a <- rlassoLATE(form, data=dat) -test3b <- rlassoLATE(x=X, z=Z, y=Y, d=d) -test4a <- rlassoLATET(form, data=dat) -test4b <- rlassoLATET(x=X, z=Z, y=Y, d=d) +form2 <- as.Formula(Y ~ d1 + d2 + X1 + X2 + X3 + X4 | X1 + X2 + X3 + X4) +test1a <- rlassoATE(form2, data = dat) +test1b <- rlassoATE(x = X, y = Y, d = d) +test2a <- rlassoATET(form2, data = dat) +test2b <- rlassoATET(x = X, y = Y, d = d) +test3a <- rlassoLATE(form, data = dat) +test3b <- rlassoLATE(x = X, z = Z, y = Y, d = d) +test4a <- rlassoLATET(form, data = dat) +test4b <- rlassoLATET(x = X, z = Z, y = Y, d = d) all.equal(test1a, test1b) all.equal(test2a, test2b) @@ -161,4 +161,3 @@ all.equal(test4a, test4b) ################################################################################################################################## - diff --git a/tests/testthat/test_LassoShooting.fit.R b/tests/testthat/test_LassoShooting.fit.R index 2589e37..8ec2793 100644 --- a/tests/testthat/test_LassoShooting.fit.R +++ b/tests/testthat/test_LassoShooting.fit.R @@ -2,15 +2,15 @@ context("Test LassoShooting.fit") library(hdm) library(testthat) -DPG_lassoShooting <- function(n, p, px, lambda0 = 110, min = 0.85, max = 1.15){ - -X <- matrix(rnorm(n*p), ncol=p) -beta <- c(rep(2,px), rep(0,p-px)) -y <- X %*% beta + rnorm(n) -loadings <- runif(p, min = min, max = max) -lambda <- lambda0 * loadings - -list(X = X, y = y, beta = beta, lambda = lambda, lambda0 = lambda0, loadings = loadings) +DPG_lassoShooting <- function(n, p, px, lambda0 = 110, min = 0.85, max = 1.15) { + + X <- 
matrix(rnorm(n * p), ncol = p)
+  beta <- c(rep(2, px), rep(0, p - px))
+  y <- X %*% beta + rnorm(n)
+  loadings <- runif(p, min = min, max = max)
+  lambda <- lambda0 * loadings
+
+  list(X = X, y = y, beta = beta, lambda = lambda, lambda0 = lambda0, loadings = loadings)
 }
 
@@ -23,7 +23,7 @@
 lambda <- ret$lambda
 rm(ret)
 
-test_that("LassoShooting.fit - Input check x, y and lambda",{
+test_that("LassoShooting.fit - Input check x, y and lambda", {
   expect_is(LassoShooting.fit(X, y, lambda), "list")
   expect_is(LassoShooting.fit(X, as.vector(y), lambda), "list")
   expect_is(LassoShooting.fit(X[, 1, drop = FALSE], y, lambda), "list")
@@ -31,9 +31,9 @@ test_that("LassoShooting.fit - Input check x, y and lambda",{
 })
 
 
-test_that("LassoShooting.fit - Input check control, XX, Xy and beta start",{
+test_that("LassoShooting.fit - Input check control, XX, Xy and beta start", {
   expect_is(LassoShooting.fit(X, y, lambda, control = list(maxIter = 150, optTol = 10^(-4), zeroThreshold = 10^(-5))), "list")
   expect_is(LassoShooting.fit(X, y, lambda, XX = (t(X) %*% X) * 0.8), "list")
   expect_is(LassoShooting.fit(X, y, lambda, Xy = (t(X) %*% y) * 0.8), "list")
-  expect_is(LassoShooting.fit(X, y, lambda, beta.start = rep(1,100)), "list")
-})
\ No newline at end of file
+  expect_is(LassoShooting.fit(X, y, lambda, beta.start = rep(1, 100)), "list")
+})
diff --git a/tests/testthat/test_rlasso.R b/tests/testthat/test_rlasso.R
index b20dca6..86d18ab 100644
--- a/tests/testthat/test_rlasso.R
+++ b/tests/testthat/test_rlasso.R
@@ -3,14 +3,14 @@ context("Test rlasso")
 library(hdm)
 
-DGP_rlasso <- function(n, p, px){
-  
-  X <- matrix(rnorm(n*p), ncol=p)
-  colnames(X) <- paste("x", 1:p, sep="")
-  beta <- c(rep(2,px), rep(0,p-px))
+DGP_rlasso <- function(n, p, px) {
+
+  X <- matrix(rnorm(n * p), ncol = p)
+  colnames(X) <- paste("x", 1:p, sep = "")
+  beta <- c(rep(2, px), rep(0, p - px))
   intercept <- 1
   y <- intercept + X %*% beta + rnorm(n)
-  
+
   list(X = X, y = y, beta = beta)
 }
 
@@ -25,7 +25,7 @@
 colnames(frame) <- c("y", paste0("x", 1:100))
 rm(ret)
 
-test_that("rlasso - Input check x and y",{
+test_that("rlasso - Input check x and y", {
   expect_is(rlasso(X, y), "rlasso")
   expect_is(rlasso(as.data.frame(X), y), "rlasso")
   expect_is(rlasso(X, as.vector(y)), "rlasso")
@@ -36,7 +36,7 @@ test_that("rlasso - Input check x and y",{
   expect_is(rlasso(as.data.frame(X[, 1, drop = FALSE]), as.vector(y)), "rlasso")
 })
 
-test_that("rlasso - formula",{
+test_that("rlasso - formula", {
   expect_is(rlasso(y ~ X), "rlasso")
   expect_is(rlasso(y ~ ., data = frame), "rlasso")
   expect_is(rlasso(y ~ x1, data = frame), "rlasso")
@@ -44,7 +44,7 @@ test_that("rlasso - formula",{
 })
 
 
-test_that("rlasso - Input check post, intercept and normalize",{
+test_that("rlasso - Input check post, intercept and normalize", {
   expect_is(rlasso(X, y, post = FALSE), "rlasso")
   expect_is(rlasso(X, y, intercept = FALSE), "rlasso")
   expect_is(rlasso(X, y, normalize = FALSE), "rlasso")
@@ -52,24 +52,26 @@ test_that("rlasso - Input check post, intercept and normalize",{
   expect_is(rlasso(as.data.frame(X), y, intercept = FALSE, normalize = FALSE), "rlasso")
 })
 
-test_that("rlasso - Input check penalty",{
-  expect_is(rlasso(X, y, penalty = list(homoscedastic = FALSE, X.dependent.lambda =FALSE)), "rlasso")
-  expect_is(rlasso(X, y, penalty = list(homoscedastic = TRUE, X.dependent.lambda =FALSE)), "rlasso")
+test_that("rlasso - Input check penalty", {
+  expect_is(rlasso(X, y, penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE)), "rlasso")
+  expect_is(rlasso(X, y, penalty = list(homoscedastic = TRUE, X.dependent.lambda = FALSE)), "rlasso")
   expect_is(rlasso(X, y, penalty = list(homoscedastic = FALSE, X.dependent.lambda = TRUE)), "rlasso")
   expect_is(rlasso(X, y, penalty = list(homoscedastic = TRUE, X.dependent.lambda = TRUE, numSim = 4000)), "rlasso")
-  expect_is(rlasso(X, y, penalty = list(homoscedastic = "none", X.dependent.lambda =FALSE, lambda.start = 100)), "rlasso")
+  expect_is(rlasso(X, y, penalty = list(homoscedastic = "none", X.dependent.lambda = FALSE, lambda.start = 100)), "rlasso")
   expect_is(rlasso(X, y, penalty = list(homoscedastic = "none", X.dependent.lambda = TRUE, lambda.start = 100)), "rlasso")
-  expect_is(rlasso(X, y, intercept = FALSE, penalty = list(homoscedastic = "none", X.dependent.lambda =FALSE, lambda.start = 100)), "rlasso")
-  expect_is(rlasso(X, y, penalty = list(homoscedastic = FALSE, X.dependent.lambda =FALSE,
-                                        lambda.start = NULL, c = 1.1, gamma = 0.1)), "rlasso")
+  expect_is(rlasso(X, y, intercept = FALSE, penalty = list(homoscedastic = "none", X.dependent.lambda = FALSE, lambda.start = 100)), "rlasso")
+  expect_is(rlasso(X, y, penalty = list(
+    homoscedastic = FALSE, X.dependent.lambda = FALSE,
+    lambda.start = NULL, c = 1.1, gamma = 0.1
+  )), "rlasso")
 })
 
-test_that("rlasso - Input check control",{
-  expect_is(rlasso(X, y, control = list(numIter = 15, tol = 10^-4, threshold = 10^-3)),"rlasso")
-  expect_is(rlasso(X, y, control = list(numIter = 25)), "rlasso")
+test_that("rlasso - Input check control", {
+  expect_is(rlasso(X, y, control = list(numIter = 15, tol = 10^-4, threshold = 10^-3)), "rlasso")
+  expect_is(rlasso(X, y, control = list(numIter = 25)), "rlasso")
 })
 
-test_that("rlasso - check methods",{
+test_that("rlasso - check methods", {
   expect_error(summary(rlasso(X, y), all = FALSE), NA)
   expect_error(print(rlasso(X, y)), NA)
   expect_error(model.matrix(rlasso(X, y)), NA)
@@ -82,7 +84,5 @@ test_that("rlasso - check methods",{
   expect_error(predict(rlasso(X, y), as.data.frame(2 * X)), NA)
   expect_error(predict(rlasso(y ~ X), as.data.frame(2 * X)), NA)
   expect_error(predict(rlasso(y ~ ., data = frame), as.data.frame(2 * X)), NA)
-  #expect_that(predict(rlasso(y ~ ., data = frame), cbind(frame, rnorm(nrow(frame)))), not(throws_error()))
+  # expect_that(predict(rlasso(y ~ ., data = frame), cbind(frame, rnorm(nrow(frame)))), not(throws_error()))
 })
-
-
diff --git a/tests/testthat/test_rlassoEffect.R b/tests/testthat/test_rlassoEffect.R
index e638091..9863fb6 100644
--- a/tests/testthat/test_rlassoEffect.R
+++ b/tests/testthat/test_rlassoEffect.R
@@ -2,13 +2,13 @@ context("Test rlassoEffect")
 library(hdm)
 library(testthat)
 
-DGP_rlasso <- function(n, p, px){
-  
-  X <- matrix(rnorm(n*p), ncol=p)
-  beta <- c(rep(2,px), rep(0,p-px))
+DGP_rlasso <- function(n, p, px) {
+
+  X <- matrix(rnorm(n * p), ncol = p)
+  beta <- c(rep(2, px), rep(0, p - px))
   intercept <- 1
   y <- intercept + X %*% beta + rnorm(n)
-  
+
   list(X = X, y = y, beta = beta)
 }
 
@@ -23,40 +23,40 @@
 colnames(frame) <- c("y", paste0("x", 1:100))
 rm(ret)
 
-test_that("rlassoEffect - Input check x, y and d",{
-  expect_is(rlassoEffect(X, y, d = X[ ,1]), "rlassoEffects")
-  expect_is(rlassoEffect(X, as.vector(y), d = X[ ,1]), "rlassoEffects")
-  expect_is(rlassoEffect(X[, 1, drop = FALSE], y, d = X[ ,1]), "rlassoEffects")
-  expect_is(rlassoEffect(X[, 1, drop = FALSE], as.vector(y), d = X[ ,1]), "rlassoEffects")
+test_that("rlassoEffect - Input check x, y and d", {
+  expect_is(rlassoEffect(X, y, d = X[, 1]), "rlassoEffects")
+  expect_is(rlassoEffect(X, as.vector(y), d = X[, 1]), "rlassoEffects")
+  expect_is(rlassoEffect(X[, 1, drop = FALSE], y, d = X[, 1]), "rlassoEffects")
+  expect_is(rlassoEffect(X[, 1, drop = FALSE], as.vector(y), d = X[, 1]), "rlassoEffects")
 })
 
-test_that("rlassoEffect - Input check I3",{
-  expect_is(rlassoEffect(X, y, d = X[, 1], I3 = c(rep(TRUE,2),rep(FALSE,2), TRUE)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[, 1], I3 = c(rep(TRUE,55),rep(FALSE,44), TRUE)), "rlassoEffects")
+test_that("rlassoEffect - Input check I3", {
+  expect_is(rlassoEffect(X, y, d = X[, 1], I3 = c(rep(TRUE, 2), rep(FALSE, 2), TRUE)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], I3 = c(rep(TRUE, 55), rep(FALSE, 44), TRUE)), "rlassoEffects")
 })
 
-test_that("rlassoEffect - Input check post, intercept and normalize",{
-  expect_is(rlassoEffect(X, y, d = X[ ,1], post = FALSE), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], intercept = FALSE), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], normalize = FALSE), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], normalize = FALSE, intercept = FALSE), "rlassoEffects")
+test_that("rlassoEffect - Input check post, intercept and normalize", {
+  expect_is(rlassoEffect(X, y, d = X[, 1], post = FALSE), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], intercept = FALSE), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], normalize = FALSE), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], normalize = FALSE, intercept = FALSE), "rlassoEffects")
 })
 
-test_that("rlassoEffect - Input check penalty",{
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = FALSE, X.dependent.lambda =FALSE)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = TRUE, X.dependent.lambda =FALSE)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = FALSE, X.dependent.lambda =TRUE, numSim = 4000)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = TRUE, X.dependent.lambda =TRUE, numSim = 4000)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = "none", X.dependent.lambda =FALSE, lambda.start = 100)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = "none", X.dependent.lambda =TRUE, lambda.start = 100)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], intercept = FALSE, penalty = list(homoscedastic = "none", X.dependent.lambda =FALSE, lambda.start = 100)), "rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], penalty = list(homoscedastic = FALSE, X.dependent.lambda =FALSE,
-                                           lambda.start = NULL, c = 1.1, gamma = 0.1)), "rlassoEffects")
+test_that("rlassoEffect - Input check penalty", {
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(homoscedastic = FALSE, X.dependent.lambda = FALSE)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(homoscedastic = TRUE, X.dependent.lambda = FALSE)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(homoscedastic = FALSE, X.dependent.lambda = TRUE, numSim = 4000)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(homoscedastic = TRUE, X.dependent.lambda = TRUE, numSim = 4000)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(homoscedastic = "none", X.dependent.lambda = FALSE, lambda.start = 100)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(homoscedastic = "none", X.dependent.lambda = TRUE, lambda.start = 100)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], intercept = FALSE, penalty = list(homoscedastic = "none", X.dependent.lambda = FALSE, lambda.start = 100)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], penalty = list(
+    homoscedastic = FALSE, X.dependent.lambda = FALSE,
+    lambda.start = NULL, c = 1.1, gamma = 0.1
+  )), "rlassoEffects")
 })
 
-test_that("rlassoEffect - Input check control",{
-  expect_is(rlassoEffect(X, y, d = X[ ,1], control = list(numIter = 15, tol = 10^-4, threshold = 10^-3)),"rlassoEffects")
-  expect_is(rlassoEffect(X, y, d = X[ ,1], control = list(numIter = 25)), "rlassoEffects")
+test_that("rlassoEffect - Input check control", {
+  expect_is(rlassoEffect(X, y, d = X[, 1], control = list(numIter = 15, tol = 10^-4, threshold = 10^-3)), "rlassoEffects")
+  expect_is(rlassoEffect(X, y, d = X[, 1], control = list(numIter = 25)), "rlassoEffects")
 })
-
-
diff --git a/vignettes/hdm.R b/vignettes/hdm.R
new file mode 100644
index 0000000..c5b0c02
--- /dev/null
+++ b/vignettes/hdm.R
@@ -0,0 +1,430 @@
+## ----R_setup, include=FALSE---------------------------------------------------
+library(knitr)
+knitr::opts_chunk$set(warning = FALSE)
+
+opts_chunk$set(tidy=TRUE, tidy.opts=list(width.cutoff=80)) #{r, tidy=TRUE, tidy.opts=list(width.cutoff=60)}
+
+# # source: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
+# knitr::opts_chunk$set(linewidth = NULL)
+# hook_output = knit_hooks$get('output')
+# knit_hooks$set(output = function(x, options) {
+#   # this hook is used only when the linewidth option is not NULL
+#   if (!is.null(n <- options$linewidth)) {
+#     x = knitr:::split_lines(x)
+#     # any lines wider than n should be wrapped
+#     if (any(nchar(x) > n)) x = strwrap(x, width = n)
+#     x = paste(x, collapse = '\n')
+#   }
+#   hook_output(x, options)
+# })
+
+## ----eval=FALSE---------------------------------------------------------------
+# install.packages("hdm")
+
+## ----eval=FALSE---------------------------------------------------------------
+# install.packages("hdm", repos="http://R-Forge.R-project.org")
+
+## ----echo=TRUE----------------------------------------------------------------
+library(hdm)
+
+## ----echo=FALSE,results='hide'----------------------------
+library(hdm); library(ggplot2); library(Formula)
+options(width=60)
+
+
+## ----eval=FALSE-------------------------------------------
+# help(package="hdm")
+# help(rlasso)
+
+## ----eval=FALSE-------------------------------------------
+# help.start()
+
+## ----eval=FALSE-------------------------------------------
+# example(rlasso)
+
+## ----DGP_lasso1-------------------------------------------
+set.seed(12345)
+n = 100 #sample size
+p = 100 # number of variables
+s = 3 # number of variables with non-zero coefficients
+X = matrix(rnorm(n*p), ncol=p)
+beta = c(rep(5,s), rep(0,p-s))
+Y = X%*%beta + rnorm(n)
+
+## ----Estimation_lasso1------------------------------------
+lasso.reg = rlasso(Y~X,post=FALSE) # use lasso, not Post-lasso
+# lasso.reg = rlasso(X, Y, post=FALSE)
+sum.lasso <- summary(lasso.reg, all=FALSE) # can also do print(lasso.reg, all=FALSE)
+yhat.lasso = predict(lasso.reg) #in-sample prediction
+Xnew = matrix(rnorm(n*p), ncol=p) # new X
+Ynew = Xnew%*%beta + rnorm(n) #new Y
+yhat.lasso.new = predict(lasso.reg, newdata=Xnew) #out-of-sample prediction
+
+post.lasso.reg = rlasso(Y~X,post=TRUE) #now use post-lasso
+print(post.lasso.reg, all=FALSE) # or use summary(post.lasso.reg, all=FALSE)
+yhat.postlasso = predict(post.lasso.reg) #in-sample prediction
+yhat.postlasso.new = predict(post.lasso.reg, newdata=Xnew) #out-of-sample prediction
+MAE<- apply(cbind(abs(Ynew-yhat.lasso.new), abs(Ynew - yhat.postlasso.new)),2, mean)
+names(MAE)<- c("lasso MAE", "Post-lasso MAE")
+print(MAE, digits=2) # MAE for Lasso and Post-Lasso
+
+## ----simulation_partialling_out---------------------------
+set.seed(1)
+n =5000; p = 20; X = matrix(rnorm(n*p), ncol=p)
+colnames(X) = c("d", paste("x", 1:19, sep=""));xnames = colnames(X)[-1]
+beta = rep(1,20)
+y = X%*%beta + rnorm(n)
+dat = data.frame(y=y, X)
+
+## ----simulation_partialling_out_full_fit------------------
+# full fit
+fmla = as.formula(paste("y ~ ", paste(colnames(X), collapse= "+")))
+full.fit= lm(fmla, data=dat)
+summary(full.fit)$coef["d",1:2]
+
+## ----simulation_partialling_out_partial_fit---------------
+fmla.y = as.formula(paste("y ~ ", paste(xnames, collapse= "+")))
+fmla.d = as.formula(paste("d ~ ", paste(xnames, collapse= "+")))
+# partial fit via ols
+rY = lm(fmla.y, data = dat)$res
+rD = lm(fmla.d, data = dat)$res
+partial.fit.ls= lm(rY~rD)
+summary(partial.fit.ls)$coef["rD",1:2]
+
+## ----simulation_partialling_out_partial_fit_lasso---------
+# partial fit via post-lasso
+rY = rlasso(fmla.y, data =dat)$res
+rD = rlasso(fmla.d, data =dat)$res
+partial.fit.postlasso= lm(rY~rD)
+summary(partial.fit.postlasso)$coef["rD",1:2]
+
+## ----simulation_partialling_out_rlassoEffectone-----------
+Eff= rlassoEffect(X[,-1],y,X[,1], method="partialling out")
+summary(Eff)$coef[,1:2]
+
+## ----simulation_doubleselection_rlassoEffectone-----------
+Eff= rlassoEffect(X[,-1],y,X[,1], method="double selection")
+summary(Eff)$coef[,1:2]
+
+## ----DGP_lasso--------------------------------------------
+set.seed(1)
+n = 100 #sample size
+p = 100 # number of variables
+s = 3 # number of non-zero variables
+X = matrix(rnorm(n*p), ncol=p)
+colnames(X) <- paste("X", 1:p, sep="")
+beta = c(rep(3,s), rep(0,p-s))
+y = 1 + X%*%beta + rnorm(n)
+data = data.frame(cbind(y,X))
+colnames(data)[1] <- "y"
+fm = paste("y ~", paste(colnames(X), collapse="+"))
+fm = as.formula(fm)
+
+## ----Estimation_inference---------------------------------
+#lasso.effect = rlassoEffects(X, y, index=c(1,2,3,50))
+lasso.effect = rlassoEffects(fm, I = ~ X1 + X2 + X3 + X50, data=data)
+print(lasso.effect)
+summary(lasso.effect)
+confint(lasso.effect)
+
+## ----joint_Estimation_inference---------------------------
+confint(lasso.effect, level=0.95, joint=TRUE)
+
+## ----lasso_plot, eval=FALSE-------------------------------
+# plot(lasso.effect, main="Confidence Intervals")
+
+## ----cps_example------------------------------------------
+library(hdm)
+data(cps2012)
+X <- model.matrix( ~ -1 + female + female:(widowed + divorced + separated + nevermarried +
+hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3) +
++ (widowed + divorced + separated + nevermarried +
+ hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3)^2, data=cps2012)
+dim(X)
+X <- X[,which(apply(X, 2, var)!=0)] # exclude all constant variables
+dim(X)
+index.gender <- grep("female", colnames(X))
+y <- cps2012$lnw
+
+## ----Joint_Estimation-------------------------------------
+effects.female <- rlassoEffects(x=X, y=y, index=index.gender)
+summary(effects.female)
+
+## ----CPS_example------------------------------------------
+joint.CI <- confint(effects.female, level = 0.95, joint = TRUE)
+joint.CI
+# plot(effects.female, joint=TRUE, level=0.95) # plot of the effects
+
+## ----CPS_Example_formula, eval=FALSE----------------------
+# effects.female <- rlassoEffects(lnw ~ female + female:(widowed + divorced + separated + nevermarried +
+# hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3)+
+# (widowed + divorced + separated + nevermarried +
+# hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3)^2, data=cps2012,
+# I = ~ female + female:(widowed + divorced + separated + nevermarried +
+# hsd08+hsd911+ hsg+cg+ad+mw+so+we+exp1+exp2+exp3))
+
+## ----Growth_processing------------------------------------
+data(GrowthData)
+dim(GrowthData)
+y = GrowthData[,1,drop=F]
+d = GrowthData[,3, drop=F]
+X = as.matrix(GrowthData)[,-c(1,2,3)]
+varnames = colnames(GrowthData)
+
+## ---------------------------------------------------------
+xnames= varnames[-c(1,2,3)] # names of X variables
+dandxnames= varnames[-c(1,2)] # names of D and X variables
+# create formulas by pasting names (this saves typing time)
+fmla= as.formula(paste("Outcome ~ ", paste(dandxnames, collapse= "+")))
+ls.effect= lm(fmla, data=GrowthData)
+
+## ----Growth_analysis--------------------------------------
+dX = as.matrix(cbind(d,X))
+lasso.effect = rlassoEffect(x=X, y=y, d=d, method="partialling out")
+summary(lasso.effect)
+
+## ----Growth_analysis2-------------------------------------
+dX = as.matrix(cbind(d,X))
+doublesel.effect = rlassoEffect(x=X, y=y, d=d, method="double selection")
+summary(doublesel.effect)
+
+## ----summary_results, results="hide"----------------------
+library(xtable)
+table= rbind(summary(ls.effect)$coef["gdpsh465",1:2],
+             summary(lasso.effect)$coef[,1:2],
+             summary(doublesel.effect)$coef[,1:2])
+colnames(table)= c("Estimate", "Std. Error") #names(summary(full.fit)$coef)[1:2]
+rownames(table)= c("full reg via ols", "partial reg
+via post-lasso ", "partial reg via double selection")
+tab= xtable(table, digits=c(2, 2,5))
+
+## ----results="asis"---------------------------------------
+tab
+
+## ----AJR_processing---------------------------------------
+data(AJR); y = AJR$GDP; d = AJR$Exprop; z = AJR$logMort
+x = model.matrix(~ -1 + (Latitude + Latitude2 + Africa +
+  Asia + Namer + Samer)^2, data=AJR)
+dim(x)
+
+## ----AJR_analysis-----------------------------------------
+#AJR.Xselect = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=FALSE)
+AJR.Xselect = rlassoIV(GDP ~ Exprop + (Latitude + Latitude2 + Africa +
+  Asia + Namer + Samer)^2 | logMort + (Latitude + Latitude2 + Africa +
+  Asia + Namer + Samer)^2, data=AJR, select.X=TRUE, select.Z=FALSE)
+summary(AJR.Xselect)
+confint(AJR.Xselect)
+
+## ----AJR_partiallingout_ols-------------------------------
+# partialling out by linear model
+fmla.y = GDP ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
+fmla.d = Exprop ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
+fmla.z = logMort ~ (Latitude + Latitude2 + Africa + Asia + Namer + Samer)^2
+rY = lm(fmla.y, data = AJR)$res
+rD = lm(fmla.d, data = AJR)$res
+rZ = lm(fmla.z, data = AJR)$res
+#ivfit.lm = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
+ivfit.lm = tsls(rY ~ rD | rZ, intercept = FALSE)
+print(cbind(ivfit.lm$coef, ivfit.lm$se),digits=3)
+
+## ----AJR_partiallingout_lasso-----------------------------
+# partialling out by lasso
+rY = rlasso(fmla.y, data = AJR)$res
+rD = rlasso(fmla.d, data = AJR)$res
+rZ = rlasso(fmla.z, data = AJR)$res
+#ivfit.lasso = tsls(y=rY,d=rD, x=NULL, z=rZ, intercept=FALSE)
+ivfit.lasso = tsls(rY ~ rD | rZ, intercept = FALSE)
+print(cbind(ivfit.lasso$coef, ivfit.lasso$se), digits=3)
+
+## ----lassoIV----------------------------------------------
+data(EminentDomain)
+z <- as.matrix(EminentDomain$logGDP$z)
+x <- as.matrix(EminentDomain$logGDP$x)
+y <- EminentDomain$logGDP$y
+d <- EminentDomain$logGDP$d
+x <- x[,apply(x, 2, mean, na.rm=TRUE) > 0.05] #
+z <- z[,apply(z, 2, mean, na.rm=TRUE) > 0.05] #
+
+## ----OLS_EminentDomina------------------------------------
+ED.ols = lm(y~cbind(d,x))
+ED.2sls = tsls(y=y, d=d, x=x, z=z[,1:2], intercept=FALSE)
+
+## ----ED_analysis------------------------------------------
+lasso.IV.Z = rlassoIV(x=x, d=d, y=y, z=z, select.X=FALSE, select.Z=TRUE)
+# or lasso.IV.Z = rlassoIVselectZt(x=X, d=d, y=y, z=z)
+summary(lasso.IV.Z)
+confint(lasso.IV.Z)
+
+## ----lassoIV_analysis-------------------------------------
+lasso.IV.XZ = rlassoIV(x=x, d=d, y=y, z=z, select.X=TRUE, select.Z=TRUE)
+summary(lasso.IV.XZ)
+confint(lasso.IV.XZ)
+
+## ----summary_results_ED, include=TRUE--------------------
+library(xtable)
+table= matrix(0, 4, 2)
+table[1,]= summary(ED.ols)$coef[2,1:2]
+table[2,]= cbind(ED.2sls$coef[1], ED.2sls$se[1])
+table[3,]= summary(lasso.IV.Z)[,1:2]
+table[4,]= summary(lasso.IV.XZ)[,1:2]
+colnames(table)= c("Estimate", "Std. Error")
+rownames(table)= c("ols regression", "IV estimation ", "selection on Z", "selection on X and Z")
+tab= xtable(table, digits=c(2, 2,7))
+
+## ----results="asis"---------------------------------------
+tab
+
+## ----401_processing---------------------------------------
+data(pension)
+y = pension$tw; d = pension$p401; z = pension$e401
+X = pension[,c("i2", "i3", "i4", "i5", "i6", "i7", "a2", "a3", "a4", "a5",
+       "fsize", "hs", "smcol", "col", "marr", "twoearn", "db", "pira", "hown")] # simple model
+xvar = c("i2", "i3", "i4", "i5", "i6", "i7", "a2", "a3", "a4", "a5",
+         "fsize", "hs", "smcol", "col", "marr", "twoearn", "db", "pira", "hown")
+xpart = paste(xvar, collapse = "+")
+form <- as.formula(paste("tw ~ ", paste(c("p401", xvar), collapse ="+"), "|",
+paste(xvar, collapse = "+")))
+formZ <- as.formula(paste("tw ~ ", paste(c("p401", xvar), collapse ="+"), "|",
+paste(c("e401", xvar), collapse = "+")))
+
+## ----401_analysis_ATE-------------------------------------
+#pension.ate = rlassoATE(X,d,y)
+pension.ate = rlassoATE(form, data = pension)
+summary(pension.ate)
+#pension.atet = rlassoATET(X,d,y)
+pension.atet = rlassoATET(form, data = pension)
+summary(pension.atet)
+
+## ----401_analysis_LATE------------------------------------
+pension.late = rlassoLATE(X,d,y,z, always_takers = FALSE)
+#pension.late = rlassoLATE(formZ, data=pension, always_takers = FALSE)
+summary(pension.late)
+pension.latet = rlassoLATET(X,d,y,z, always_takers = FALSE)
+#pension.latet = rlassoLATET(formZ, data=pension, always_takers = FALSE)
+summary(pension.latet)
+
+## ----summary_pension, include=TRUE------------------------
+library(xtable)
+table= matrix(0, 4, 2)
+table[1,]= summary(pension.ate)[,1:2]
+table[2,]= summary(pension.atet)[,1:2]
+table[3,]= summary(pension.late)[,1:2]
+table[4,]= summary(pension.latet)[,1:2]
+colnames(table)= c("Estimate", "Std. Error")
+rownames(table)= c("ATE", "ATET ", "LATE", "LATET")
+tab= xtable(table, digits=c(2, 2,2))
+
+## ----results="asis",echo=FALSE----------------------------
+tab
+
+## ----401_processing_interaction---------------------------
+
+# generate all interactions of X's
+xvar2 = paste("(", paste(xvar, collapse = "+"), ")^2", sep="")
+
+# ATE and ATE with interactions
+forminteract = formula(paste("tw ~", xvar2, " + p401",
+                             "|",
+                             xvar2, sep=""))
+# LATE and LATET with interactions
+formZinteract = formula(paste("tw ~", xvar2, " + p401",
+                              "|",
+                              xvar2, " + e401", sep=""))
+
+## ----401_analysis_interaction, results="hide", echo="FALSE"----
+# pension.ate= rlassoATE(forminteract, data = pension)
+# summary(pension.ate)
+#
+# pension.atet= rlassoATET(forminteract, data = pension)
+# summary(pension.atet)
+#
+# pension.late= rlassoLATE(formZinteract, data = pension, always_takers = FALSE)
+# summary(pension.late)
+#
+# pension.latet= rlassoLATET(formZinteract, data = pension, always_takers = FALSE)
+# summary(pension.latet)
+#
+# table= matrix(0, 4, 2)
+# table[1,]= summary(pension.ate)[,1:2]
+# table[2,]= summary(pension.atet)[,1:2]
+# table[3,]= summary(pension.late)[,1:2]
+# table[4,]= summary(pension.latet)[,1:2]
+# colnames(table)= c("Estimate", "Std. Error")
+# rownames(table)= c("ATE", "ATET ", "LATE", "LATET")
+# tab= xtable(table, digits=c(2, 2,2))
+
+## ----results="asis",echo=FALSE----------------------------
+# tab
+
+## ----lasso------------------------------------------------
+#library(hdm)
+#library(stats)
+set.seed(1)
+n = 100
+p1 = 20
+p2= 20
+D= matrix(rnorm(n*p1), n, p1) # Causes
+W= matrix(rnorm(n*p2), n, p2) # Controls
+X = cbind(D,W) # Regressors
+Y = D[,1]*5 + W[,1]*5 + rnorm(n) #Outcome
+confint(rlassoEffects(X,Y, index=c(1:p1)), joint=TRUE)
+#BCK Joint Confidence Band for Reg Coefficients 1 to 20
+
+## ----siminf-----------------------------------------------
+library(mvtnorm)
+set.seed(1)
+n = 100
+p = 80
+s = 9
+covar=toeplitz(0.9^(0:(p-1)))
+diag(covar) = rep(1,p)
+mu = rep(0,p)
+X = mvtnorm::rmvnorm(n=n, mean=mu, sigma=covar) # Regressors
+beta = c(s:1, rep(0, p-s))
+Y = X%*% beta + rnorm(n, sd = 5) #Outcome
+# Estimate rlassoEffects
+rl = rlassoEffects(X,Y, index=c(1:p))
+
+# unadjusted
+p.unadj = p_adjust(rl, method = "none")
+# Number of rejections at a prespecified significance level
+sum(p.unadj[,2]<0.05)
+
+## ----siminf2----------------------------------------------
+# Romano-Wolf Stepdown Correction
+p.rw = p_adjust(rl, method = "RW", B = 1000)
+# Number of rejections at a prespecified significance level (5%)
+sum(p.rw[,2]<0.05)
+
+## ----siminf3----------------------------------------------
+# Adjust with Bonferroni correction
+p.bonf = p_adjust(rl, method = "bonferroni")
+# Number of rejections at a prespecified significance level
+sum(p.bonf[,2]<0.05)
+
+# Benjamini-Hochberg correction
+p.bh = p_adjust(rl, method = "BH")
+# Number of rejections at a prespecified significance level
+sum(p.bh[,2]<0.05)
+
+## ----results="hide"---------------------------------------
+data(pension)
+
+## ----eval=FALSE-------------------------------------------
+# help(pension)
+
+## ----results="hide"---------------------------------------
+data(GrowthData)
+
+## ----results="hide"---------------------------------------
+data(AJR)
+
+## ----results="hide"---------------------------------------
+data(EminentDomain)
+
+## ----results="hide"---------------------------------------
+data(BLP)
+
+## ----results="hide"---------------------------------------
+data(cps2012)