diff --git a/DESCRIPTION b/DESCRIPTION index 15737e3..fae0a71 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,12 +2,13 @@ Package: glmmPen Type: Package Title: High Dimensional Penalized Generalized Linear Mixed Models (pGLMM) -Version: 1.5.2.11 -Date: 2022-12-12 +Version: 1.5.3.0 +Date: 2023-03-14 Authors@R: c( person("Hillary", "Heiling", email = "hheiling@live.unc.edu", role = c("aut", "cre")), - person("Naim", "Rashid", email = "naim@unc.edu", role = c("aut")), - person("Quefeng", "Li", email = "quefeng@email.unc.edu", role = c("aut"))) + person("Naim", "Rashid", email = "nur2@email.unc.edu", role = c("aut")), + person("Quefeng", "Li", email = "quefeng@email.unc.edu", role = c("aut")), + person("Joseph", "Ibrahim", email = "ibrahim@bios.unc.edu", role = c("aut"))) Maintainer: Hillary Heiling Description: Fits high dimensional penalized generalized linear mixed models using @@ -33,10 +34,10 @@ Imports: ncvreg, reshape2, rstan (>= 2.18.1), - rstantools (>= 2.0.0), stringr, mvtnorm, - MASS + MASS, + coxme Depends: lme4, bigmemory, @@ -55,7 +56,8 @@ NeedsCompilation: yes Packaged: 2019-01-25 20:03:59 UTC; hheiling Author: Hillary Heiling [aut, cre], Naim Rashid [aut], - Quefeng Li [aut] + Quefeng Li [aut], + Joseph Ibrahim [aut] Suggests: testthat, knitr, diff --git a/NAMESPACE b/NAMESPACE index ede802f..e83ab88 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ S3method(sigma,pglmmObj) S3method(summary,pglmmObj) export(LambdaSeq) export(adaptControl) +export(coxphControl) export(glmm) export(glmmPen) export(glmmPen_FA) @@ -43,6 +44,8 @@ importFrom(bigmemory,big.matrix) importFrom(bigmemory,describe) importFrom(bigmemory,read.big.matrix) importFrom(bigmemory,write.big.matrix) +importFrom(coxme,VarCorr) +importFrom(coxme,coxme) importFrom(lme4,VarCorr) importFrom(lme4,factorize) importFrom(lme4,findbars) diff --git a/R/E_step.R b/R/E_step.R index 5346fab..586d410 100644 --- a/R/E_step.R +++ b/R/E_step.R @@ -4,7 +4,8 @@ # only use fixed effects # ranef_idx: index of which random effects are non-zero as of latest M-step (will not # sample from random effects that have been penalized out) -# y: response +# y: response. If coxph, y = event indicator +# y_times: If coxph, value of observed times (NULL if not coxph family) # X: fixed effects covariates # Znew2: For each group k (k = 1,...,d), Znew2 = Z * Gamma (Gamma = t(chol(sigma))) # group: group index @@ -18,14 +19,22 @@ # d: total number groups # uold: posterior sample to use for initialization of E-step # proposal_SD, batch, batch_length, offset_increment: arguments for adaptive random walk +# coxph_options: for Cox Proportional Hazards model, additional parameters of interest #' @importFrom bigmemory attach.big.matrix describe big.matrix #' @importFrom rstan sampling extract -E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, - nMC, nMC_burnin, family, link, phi, sig_g, +E_step = function(coef, ranef_idx, y, y_times = NULL, X, Znew2, group, offset_fit, + nMC, nMC_burnin, family, link, phi = 0.0, sig_g = 1.0, sampler, d, uold, proposal_SD, batch, batch_length, - offset_increment, trace){ + offset_increment, trace, coxph_options = NULL){ + + if((family == "coxph") & !(sampler == "stan")){ + stop("'coxph' family currently only supports the 'stan' sampler") + } + if((family == "coxph") & (is.null(coxph_options))){ + stop("coxph_options must be of class 'coxphControl' (see coxphControl() documentation) for the 'coxph' family") + } gibbs_accept_rate = matrix(NA, nrow = d, ncol = nrow(Znew2)/d) @@ -82,7 +91,7 @@ E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, print(gibbs_accept_rate) } - }else if(sampler == "stan"){ + }else if((sampler == "stan") & (family != "coxph")){ u0 = big.matrix(nrow = nMC, ncol = ncol(Znew2), init=0) # use ', init = 0' for sampling within EM algorithm @@ -99,10 +108,6 @@ E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, # Number draws to extract in each chain (after burn-in) nMC_chain = nMC - # Record last elements of each chain for initialization of next E step - ## Restriction: single chain for each E-step - last_draws = matrix(0, nrow = 1, ncol = ncol(Znew2)) - # For each group, sample from posterior distribution (sample the alpha values) for(k in 1:d){ idx_k = which(group == k) @@ -121,16 +126,15 @@ E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, q = length(ranef_idx), # number random effects (or common factors) eta_fef = as.array(as.numeric(X_k %*% matrix(coef[1:ncol(X)], ncol = 1)) + offset_fit[idx_k]), # fixed effects componenet of linear predictor y = as.array(y_k), # outcomes for group k - Z = Z_k) # portion of Z matrix corresonding to group k + Z = Z_k) # portion of Z matrix corresponding to group k }else{ # length(idx_k) > 1 dat_list = list(N = length(idx_k), # Number individuals in group k q = length(ranef_idx), # number random effects eta_fef = as.numeric(X_k %*% matrix(coef[1:ncol(X)], ncol = 1) + offset_fit[idx_k]), # fixed effects componenet of linear predictor y = y_k, # outcomes for group k - Z = Z_k) # portion of Z matrix corresonding to group k + Z = Z_k) # portion of Z matrix corresponding to group k } - if(family == "gaussian"){ dat_list$sigma = sig_g # standard deviation of normal dist of y @@ -150,7 +154,8 @@ E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, init_lst[[1]] = list(alpha = uold[cols_use]) } - # Avoid excessive warnings when nMC_chain is low in early EM iterations + # Sampling step + # suppressWarnings(): Avoid excessive warnings when nMC_chain is low in early EM iterations stan_fit = suppressWarnings(rstan::sampling(stan_file, data = dat_list, init = init_lst, iter = nMC_chain + nMC_burnin, warmup = nMC_burnin, show_messages = FALSE, refresh = 0, @@ -165,10 +170,6 @@ E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, draws_mat = matrix(stan_out[,1], ncol = 1) } - - # Find last elements for each chain for initialization of next E step - last_draws[1,cols_use] = draws_mat[nMC_chain,] - if(nrow(draws_mat) == nMC){ u0[,cols_use] = draws_mat }else{ # nrow(draws_mat) > nMC due to ceiling function in 'iter' specification @@ -180,6 +181,129 @@ E_step = function(coef, ranef_idx, y, X, Znew2, group, offset_fit, } # End k for loop + }else if((sampler == "stan") & (family == "coxph")){ + + # Alternative approach: https://rpubs.com/kaz_yos/surv_stan_piecewise1 + + # Cox Proportional Hazards family: Calculate cut-points to use for time intervals + ## Divide the timeline into J = cut_num intervals such that there are an equal + ## (or approximately equal) number of events in each interval + ## Note: must have at least one event in each interval (preferably > 2) to be identifiable + cut_num = coxph_options$cut_num + event_total = sum(y) + event_idx = which(y == 1) + # Determine number of events per time interval, event_j + if((event_total %% cut_num) == 0){ # event_total is a factor of cut_num + event_cuts = rep(event_total / cut_num, times = cut_num) + }else{ + tmp = event_total %/% cut_num + event_cuts = rep(tmp, times = cut_num) + for(j in 1:(event_total - tmp*cut_num)){ + event_cuts[j] = event_cuts[j] + 1 + } + } + + # warning if only 1 event for an interval, stop if 0 events for an interval + if(any(event_cuts == 1)){ + warning("at least one time interval for the piecewise exponential hazard model has only 1 event, ", + "please see the coxphControl() documentation for details and tips on how to fix the issue", + immediate. = TRUE) + }else if(any(event_cuts == 0)){ + stop("at least one time interval for the piecewise exponential hazard model has 0 events, ", + "please see the coxphControl() documentation for details and tips on how to fix the issue") + } + + cut_pts_idx = numeric(cut_num) + for(j in 1:cut_num){ + cut_pts_idx[j] = event_idx[sum(event_cuts[1:j])] + } + + cut_points = numeric(cut_num) + for(j in 1:(cut_num-1)){ + cut_points[j] = mean(y_times[cut_pts_idx[j]], y_times[cut_pts_idx[j]+1]) + } + cut_points[cut_num] = max(y_times) + 1 + # cut_points = y_times[cut_pts_idx] + + u0 = big.matrix(nrow = nMC, ncol = ncol(Znew2) + length(cut_points), init=0) # use ', init = 0' for sampling within EM algorithm + + stan_file = stanmodels$coxph_piecewise_exp_model + + # Number draws to extract in each chain (after burn-in) + nMC_chain = nMC + + # If necessary, restrict columns of Znew2 matrix to columns associated with non-zero + # latent variables (random effects / latent common factors) + # Also determine relevant rows of u0 matrix to save alpha samples + cols_analyze = NULL + for(k in 1:d){ + cols_k = seq(from = k, to = ncol(Znew2), by = d) + cols_analyze = c(cols_analyze,cols_k[ranef_idx]) + } + cols_analyze = cols_analyze[order(cols_analyze)] + + # Indicator matrix: + ## For subject i, determine which columns of the Znew2 matrix are relevant for analyses + ## In other words, if subject i in group k, indicate which rows of Znew2 matrix associated with group k + I_mat = matrix(0, nrow = nrow(Znew2), ncol = ncol(Znew2)) + for(k in 1:d){ + idx_k = which(group == k) + cols_k = seq(from = k, to = ncol(Znew2), by = d) + I_mat[idx_k,cols_k] = 1 + } + + # Sample the random effects / latent common factors 'alpha': group-specific values needed + # Also sample log-hazard values 'lhaz' for each time interval + ## As opposed to other families, sample all (d*q) random effects / (d*r) latent common factors + ## together instead of sampling by group. Reasoning: want log-hazard values to be + ## consistent regardless of group identity + dat_list = list(N = length(y), # Number of observations + NT = length(cut_points), # Number of time intervals + H = length(ranef_idx)*d, # Number groups times number latent variables (latent random effects or latent common factors) + eta_fef = as.numeric(X %*% matrix(coef[1:ncol(X)], ncol = 1) + offset_fit), # Fixed effects portion of linear predictor + y = y, # event indicator (1 = event, 0 = censor) + obs_t = y_times, # observed times + Z = Znew2[,cols_analyze], # Z * Gamma or Z * B matrix, see calculation in fit_dat_coxph + cutpt = c(0, cut_points), # Time interval boundaries, including 0 as lower bound of first interval + I = I_mat[,cols_analyze], # Indicator matrix, see above calculation + lhaz_prior = coxph_options$lhaz_prior) # Specifies standard deviation of normal prior + + # initialize posterior random draws + alpha_idx = cols_analyze + lhaz_idx = (ncol(Znew2)+1):length(uold) + # init: See "rstan::stan" documentation + ## Set initial values by providing a list equal in length to the number of chains (1). + ## The elements of this list should themselves be named lists, where each of these + ## named lists has the name of a parameter and is use to specify the initial values for + # that parameter for the corresponding chain + init_lst = list() + init_lst[[1]] = list(alpha = uold[alpha_idx], + lhaz = uold[lhaz_idx]) + + # Sampling step + # suppressWarnings(): Avoid excessive warnings when nMC_chain is low in early EM iterations + stan_fit = suppressWarnings(rstan::sampling(stan_file, data = dat_list, init = init_lst, + iter = nMC_chain + nMC_burnin, + warmup = nMC_burnin, show_messages = FALSE, refresh = 0, + chains = 1, cores = 1)) + + stan_out = as.matrix(stan_fit) + # Check organization of samples + # print(colnames(stan_out)) # first alpha samples, then lhaz samples, then lp__ value + # Exclude lp__ column of output (log density up to a constant) + samp_idx = 1:(length(cols_analyze) + length(cut_points)) + draws_mat = stan_out[,samp_idx] + # Specify column locations of u0 matrix to save samples from stan_fit object + u0_idx = c(cols_analyze, ((1:length(cut_points))+ncol(Znew2))) + + if(nrow(draws_mat) == nMC){ + u0[,u0_idx] = draws_mat + }else{ # nrow(draws_mat) > nMC due to ceiling function in 'iter' specification + start_row = nrow(draws_mat) - nMC + 1 + rows_seq = start_row:nrow(draws_mat) + u0[,u0_idx] = draws_mat[rows_seq,] + } + } # End if-else sampler return(list(u0 = describe(u0), proposal_SD = proposal_SD, gibbs_accept_rate = gibbs_accept_rate, diff --git a/R/M_step.R b/R/M_step.R index 9eeb345..bd837bf 100644 --- a/R/M_step.R +++ b/R/M_step.R @@ -61,7 +61,7 @@ M_step = function(y, X, Z, u_address, M, J, group, family, link_int, coef, offse K = numeric(J_XZ) for(j in unique(XZ_group)){ idx = which(XZ_group == j) - K[j+1] = length(idx) + K[j+1] = length(idx) # Add 1 because smallest XZ_group value is 0 } # Number of groups wrt observations diff --git a/R/control_options.R b/R/control_options.R index 214d518..2d0fa0c 100644 --- a/R/control_options.R +++ b/R/control_options.R @@ -282,6 +282,72 @@ adaptControl = function(batch_length = 100, offset = 0){ class = "adaptControl") } +#' @title Control of Cox Proportional Hazards Model Fitting +#' +#' @description Constructs the control structure for additional parameters needed for +#' the sampling and optimization routines involving the Cox Proportional Hazards model fit algorithm +#' +#' @param cut_num positive integer specifying the number of time intervals to include in +#' the piecewise exponential hazard model assumptions for the sampling step. Default is 8. +#' General recommendation: use between 5 and 10 intervals. See the Details section for +#' additional information. +#' @param lhaz_prior positive numeric value specifying the standard deviation of the +#' multivariate normal prior for the log of the baseline hazard values for each time interval. +#' Default is 3. If encounter convergence issues, the user can consider +#' increasing or decreasing this value (e.g. increase to 4 or decrease to 2 ...). +#' +#' @return Function returns a list inheriting from class \code{optimControl} +#' containing fit and optimization criteria values used in optimization routine. +#' +#' @details In the piecewise exponential hazard model assumption---which is assumed in the +#' sampling step (E-step) for the Cox PH family---there is an assumption that the +#' time line of the data can be cut into \code{cut_num} +#' time intervals and the baseline hazard is constant within +#' each of these time intervals. In the sampling step, we need to estimate +#' these baseline hazard values (specifically, we estimate the log of the baseline +#' hazard values). We determine cut points by specifying the total number of cuts +#' to make (\code{cut_num}) and then specifying time values for cut points such +#' that each time interval has an equal number (or approximately equal number) +#' of events. Each time interval must have at least one event for the model +#' to be identifiable, but more events per time interval is better. +#' Consequently, having too many cut points could result in (i) not having enough +#' events for each time interval and/or (ii) significantly slowing down the +#' sampling step due to requiring the estimation of many log baseline hazard values. +#' Additionally, data with few events could result too few events per time interval +#' even for a small number of cut points. We generally recommend having +#' 8 total time intervals (more broadly, between 5 and 10). Warnings or errors +#' will occur for cases when there are 1 or 0 events for a time interval. +#' If this happens, either adjust the \code{cut_num} value appropriately, +#' or in the case when the data simply has a very small number of events, +#' consider not using this software for your estimation purposes. +#' +#' @export +coxphControl = function(cut_num = 8, lhaz_prior = 3){ + + ######################################################################################### + # Input checks and restrictions + ######################################################################################### + + # cut_num + if((floor(cut_num) != cut_num) | (cut_num < 1)){ + stop("cut_num must be a positive integer") + } + + if((cut_num < 5) | (cut_num > 10)){ + warning("the glmmPen team recommends that you keep cut_num between 5 and 10; 8 is typically a good cut_num value", immediate. = TRUE) + } + + # lhaz_prior + if(lhaz_prior <= 0){ + stop("lhaz_prior must be a positive numeric value") + } + + # output object + structure(list(cut_num = cut_num, lhaz_prior = lhaz_prior), + class = "coxphControl") + +} + #' @title Control of Penalized Generalized Linear Mixed Model Fitting #' #' @description Constructs the control structure for the optimization of the penalized mixed model fit algorithm. diff --git a/R/fit_dat.R b/R/fit_dat.R index 628f216..e2814c2 100644 --- a/R/fit_dat.R +++ b/R/fit_dat.R @@ -322,7 +322,7 @@ fit_dat = function(dat, lambda0 = 0, lambda1 = 0, }else{ vars = var_start cov = var = matrix(vars, ncol = 1) - gamma = matrix(sqrt(var), ncol = 1) + gamma = matrix(sqrt(vars), ncol = 1) } if(trace >= 1){ @@ -569,8 +569,10 @@ fit_dat = function(dat, lambda0 = 0, lambda1 = 0, out$warnings = "Error in M step: coefficient values diverged" } }else if(randInt_issue == 1){ - warning("Error in model fit: Random intercept variance is too small, indicating that this model \n - should be fit using traditional generalized linear model techniques.", immediate. = TRUE) + warning("Error in model fit: Random intercept variance is too small, indicating either that + there are high correlations among the covariates (if so, consider reducing these correlations + or changing the Elastic Net alpha value) or that this model should be fit + using traditional generalized linear model techniques.", immediate. = TRUE) out$warnings = "Error in model fit: random intercept variance becomes too small, model should be fit using regular generalized linear model techniques" } @@ -661,7 +663,7 @@ fit_dat = function(dat, lambda0 = 0, lambda1 = 0, Znew2[group == j,seq(j, ncol(Z), by = d)] = Z[group == j,seq(j, ncol(Z), by = d)]%*%gamma } - # Initial points for Metropolis within Gibbs E step algorithms + # Initial points for E step sampling algorithms uold = as.numeric(u0[nrow(u0),]) # if random effect penalized out in past model / in previous M-step, do not # collect posterior samples for this random effect diff --git a/R/formula_data_edits.R b/R/formula_data_edits.R index 46da572..6ec6543 100644 --- a/R/formula_data_edits.R +++ b/R/formula_data_edits.R @@ -100,7 +100,7 @@ checkXmatrix = function(X){ if (typeof(X)=="character") stop("input variables must be numeric", call.=FALSE) # Make sure individuals did not input an additional intercept or a variable with only one value # for all observations - col_vars = apply(X[,-1,drop=F], 2, var, na.rm=TRUE) + col_vars = apply(X[,-1,drop=FALSE], 2, var, na.rm=TRUE) if(any(col_vars == 0)){ # If true, more than one column with zero variance # If only one column, class(col_vars) == "numeric" diff --git a/R/glmmPen.R b/R/glmmPen.R index 8427f95..d235c3a 100644 --- a/R/glmmPen.R +++ b/R/glmmPen.R @@ -1141,19 +1141,26 @@ XZ_std = function(fD_out){ #' @inheritParams lambdaControl #' @param X matrix of standardized fixed effects (see \code{std} function in \code{ncvreg} #' documenation). X should not include intercept. -#' @param y numeric vector of response values +#' @param y numeric vector of response values. If "coxph" family, \code{y} are the event indicator +#' values (0 if censored, 1 if event) +#' @param y_times numeric vector of observed times for "coxph" family; \code{NULL} for all +#' other families #' @param nlambda positive integer specifying number of penalty parameters (lambda) with #' which to fit a model. #' @param penalty.factor an optional numeric vector equal to the \code{fixef_noPen} argument #' in \code{\link{glmmPen}} #' +#' @details If the family is "coxph", the \code{y}, \code{y_times}, and \code{X} must be +#' sorted such that the subjects' \code{y_times} are sorted from the smallest to the largest times. +#' The "coxph" family procedure is still in production and not yet ready. +#' #' @return numeric sequence of penalty parameters of length \code{nlambda} ranging from the #' minimum penalty parameter (first element) equal to fraction \code{lambda.min} multiplied by the #' maximum penalty parameter to the maximum penalty parameter (last element) #' -#' @importFrom ncvreg setupLambda +#' @importFrom ncvreg setupLambda #' @export -LambdaSeq = function(X, y, family, alpha = 1, lambda.min = NULL, nlambda = 10, +LambdaSeq = function(X, y, y_times = NULL, family, alpha = 1, lambda.min = NULL, nlambda = 10, penalty.factor = NULL){ # Checks if(!is.matrix(X)){ @@ -1192,9 +1199,26 @@ LambdaSeq = function(X, y, family, alpha = 1, lambda.min = NULL, nlambda = 10, penalty.factor = rep(1, p) } - # setupLambda from ncvreg package + # setupLambda and setupLambdaCox from ncvreg package ## Order: from max lambda to min lambda - lambda = setupLambda(X, yy, family, alpha, lambda.min, nlambda, penalty.factor) + if(family != "coxph"){ + lambda = setupLambda(X, yy, family, alpha, lambda.min, nlambda, penalty.factor) + }else if(family == "coxph"){ + stop("LambdaSeq is not yet set up for Cox Proportional Hazards (coxph) family") + # if(is.null(y_times)){ + # stop("y_times must be given for the 'coxph' family") + # } + # if(!all(unique(y) %in% c(0,1))){ + # stop("y must be the event indicator (0 vs 1) for the 'coxph' family") + # } + # if(all.equal(y_times, y_times[order(y_times)])){ + # lambda = setupLambdaCox(X, y_times, y, alpha, lambda.min, nlambda, penalty.factor) + # }else{ + # stop("observations for the 'coxph' family must be sorted by y_times (smalles to largest)") + # } + + } + # reverse the order of the lambda - from min lambda to max lambda lambda_rev = lambda[order(lambda)] diff --git a/R/methods.R b/R/methods.R index 0d35fe0..954196e 100644 --- a/R/methods.R +++ b/R/methods.R @@ -9,7 +9,7 @@ #' #' @importFrom lme4 fixef #' @export -fixef.pglmmObj = function(object){ +fixef.pglmmObj = function(object, ...){ object$fixef } @@ -18,10 +18,9 @@ fixef.pglmmObj = function(object){ #' #' @importFrom lme4 ranef #' @export -ranef.pglmmObj = function(object){ +ranef.pglmmObj = function(object, ...){ object$ranef } - #' @describeIn pglmmObj Provides the random effect covariance matrix. If family is Gaussian, #' also returns the standard deviation of the residual error. #' diff --git a/R/select_tune.R b/R/select_tune.R index e81a6f3..426542c 100644 --- a/R/select_tune.R +++ b/R/select_tune.R @@ -342,6 +342,11 @@ select_tune = function(dat, offset = NULL, family, covar = c("unstructured","ind if(!is.numeric(BIC_crit)) BIC_crit = Inf if(length(BIC_crit) != 1) BIC_crit = Inf + if(is.na(BIC_crit)){ + stop("Model selection criteria ", BIC_option, " calculated as NA due to issues with model fit, + stopping variable selection proceedure") + } + if(BIC_crit < BIC_critold){ fout = out BIC_critold = BIC_crit diff --git a/R/sim_generation.R b/R/sim_generation.R index 67561f3..3a6f2a7 100644 --- a/R/sim_generation.R +++ b/R/sim_generation.R @@ -21,8 +21,8 @@ #' random effects covariance matrix (applied to the non-zero random effects) #' @param family character string specifying which family to generate data from. #' Family options include "binomial" (default), "poisson", and "gaussian". -#' @param corr optional value to specify correlation in the random effects -#' covariance matrix. Default \code{NULL}, only available within \code{sim.data}. +#' @param corr optional value to specify correlation between covariates +#' in the model matrix. Default \code{NULL}, only available within \code{sim.data}. #' @param seed integer to use for the setting of a random seed #' @param imbalance integer of 0 or 1 indicating whether the observations should #' be equally distributed among the groups (0) or unequally distributed (1). @@ -83,7 +83,7 @@ sim.data = function(n, ptot, pnonzero, nstudies, sd_raneff = 1, family = "binomi #mat = matrix(rbinom(n*p, p = 0.5, size = 1), nrow = n) # now switching back to normal to have more resolution to show prediction performance }else{ cor = matrix(corr, p, p) - diag(cor) = sd_x + diag(cor) = (sd_x)^2 sigma = cor # 0.5*cor mat = rmvnorm(n = n , mean = rep(0,p), sigma = sigma) } diff --git a/R/var_start_init.R b/R/var_start_init.R index c944c20..8f4bf81 100644 --- a/R/var_start_init.R +++ b/R/var_start_init.R @@ -2,6 +2,7 @@ # data: list object containing response y and group vector 'group' # fam_fun: family function output from 'family_export' function #' @importFrom lme4 lmer glmer VarCorr +#' @importFrom coxme coxme VarCorr var_init = function(data, fam_fun){ y = data$y grp = data$group @@ -9,6 +10,10 @@ var_init = function(data, fam_fun){ int_only = lmer(formula = y ~ 1 + (1 | grp)) }else if(fam_fun$family %in% c("binomial","poisson")){ int_only = glmer(formula = y ~ 1 + (1 | grp), family = fam_fun) + }else if(fam_fun$family == "coxph"){ + y_times = data$y_times + y_status = data$y_status + int_only = coxme(formula = Surv(y_times, y_status) ~ (1 | grp)) } var_start = VarCorr(int_only)$grp[[1]] * 2.0 diff --git a/inst/stan/binomial_logit_model.stan b/inst/stan/binomial_logit_model.stan index d276756..d85e1ce 100644 --- a/inst/stan/binomial_logit_model.stan +++ b/inst/stan/binomial_logit_model.stan @@ -1,6 +1,6 @@ data { int N; // Number of observations in group k (N_k) - int q; // Number of random effect variables + int q; // Number of latent variables (glmmPen: number random effects; glmmPen_FA: number latent common factors) vector[N] eta_fef; // fixed effects portion of linear predictor for individuals in group k int y[N]; // Outcome values in group k matrix[N,q] Z; // Portion of Z * Gamma matrix for group k diff --git a/inst/stan/gaussian_identity_model.stan b/inst/stan/gaussian_identity_model.stan index 0a999e3..348e980 100644 --- a/inst/stan/gaussian_identity_model.stan +++ b/inst/stan/gaussian_identity_model.stan @@ -1,6 +1,6 @@ data { int N; // Number of observations in group k (N_k) - int q; // Number of random effect variables + int q; // Number of latent variables (glmmPen: number random effects; glmmPen_FA: number latent common factors) vector[N] eta_fef; // fixed effects portion of linear predictor for individuals in group k real y[N]; // Outcome values in group k matrix[N,q] Z; // Portion of Z * Gamma matrix for group k diff --git a/inst/stan/poisson_log_model.stan b/inst/stan/poisson_log_model.stan index 036dc5c..08f3c55 100644 --- a/inst/stan/poisson_log_model.stan +++ b/inst/stan/poisson_log_model.stan @@ -1,6 +1,6 @@ data { int N; // Number of observations in group k (N_k) - int q; // Number of random effect variables + int q; // Number of latent variables (glmmPen: number random effects; glmmPen_FA: number latent common factors) vector[N] eta_fef; // fixed effects portion of linear predictor for individuals in group k int y[N]; // Outcome values in group k matrix[N,q] Z; // Portion of Z * Gamma matrix for group k diff --git a/man/LambdaSeq.Rd b/man/LambdaSeq.Rd index d8df1fc..6f0c8f6 100644 --- a/man/LambdaSeq.Rd +++ b/man/LambdaSeq.Rd @@ -7,6 +7,7 @@ LambdaSeq( X, y, + y_times = NULL, family, alpha = 1, lambda.min = NULL, @@ -18,7 +19,11 @@ LambdaSeq( \item{X}{matrix of standardized fixed effects (see \code{std} function in \code{ncvreg} documenation). X should not include intercept.} -\item{y}{numeric vector of response values} +\item{y}{numeric vector of response values. If "coxph" family, \code{y} are the event indicator +values (0 if censored, 1 if event)} + +\item{y_times}{numeric vector of observed times for "coxph" family; \code{NULL} for all +other families} \item{family}{a description of the error distribution and link function to be used in the model. Currently, the \code{glmmPen} algorithm allows the Binomial ("binomial" or binomial()), @@ -57,3 +62,8 @@ maximum penalty parameter to the maximum penalty parameter (last element) Calculates the sequence of penalty parameters used in the model selection procedure. This function calls functions from package \code{ncvreg}. } +\details{ +If the family is "coxph", the \code{y}, \code{y_times}, and \code{X} must be +sorted such that the subjects' \code{y_times} are sorted from the smallest to the largest times. +The "coxph" family procedure is still in production and not yet ready. +} diff --git a/man/coxphControl.Rd b/man/coxphControl.Rd new file mode 100644 index 0000000..18ce9ff --- /dev/null +++ b/man/coxphControl.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/control_options.R +\name{coxphControl} +\alias{coxphControl} +\title{Control of Cox Proportional Hazards Model Fitting} +\usage{ +coxphControl(cut_num = 8, lhaz_prior = 3) +} +\arguments{ +\item{cut_num}{positive integer specifying the number of time intervals to include in +the piecewise exponential hazard model assumptions for the sampling step. Default is 8. +General recommendation: use between 5 and 10 intervals. See the Details section for +additional information.} + +\item{lhaz_prior}{positive numeric value specifying the standard deviation of the +multivariate normal prior for the log of the baseline hazard values for each time interval. +Default is 3. If encounter convergence issues, the user can consider +increasing or decreasing this value (e.g. increase to 4 or decrease to 2 ...).} +} +\value{ +Function returns a list inheriting from class \code{optimControl} +containing fit and optimization criteria values used in optimization routine. +} +\description{ +Constructs the control structure for additional parameters needed for +the sampling and optimization routines involving the Cox Proportional Hazards model fit algorithm +} +\details{ +In the piecewise exponential hazard model assumption---which is assumed in the +sampling step (E-step) for the Cox PH family---there is an assumption that the +time line of the data can be cut into \code{cut_num} +time intervals and the baseline hazard is constant within +each of these time intervals. In the sampling step, we need to estimate +these baseline hazard values (specifically, we estimate the log of the baseline +hazard values). We determine cut points by specifying the total number of cuts +to make (\code{cut_num}) and then specifying time values for cut points such +that each time interval has an equal number (or approximately equal number) +of events. Each time interval must have at least one event for the model +to be identifiable, but more events per time interval is better. +Consequently, having too many cut points could result in (i) not having enough +events for each time interval and/or (ii) significantly slowing down the +sampling step due to requiring the estimation of many log baseline hazard values. +Additionally, data with few events could result too few events per time interval +even for a small number of cut points. We generally recommend having +8 total time intervals (more broadly, between 5 and 10). Warnings or errors +will occur for cases when there are 1 or 0 events for a time interval. +If this happens, either adjust the \code{cut_num} value appropriately, +or in the case when the data simply has a very small number of events, +consider not using this software for your estimation purposes. +} diff --git a/man/pglmmObj-class.Rd b/man/pglmmObj-class.Rd index 5d39fcd..896f66c 100644 --- a/man/pglmmObj-class.Rd +++ b/man/pglmmObj-class.Rd @@ -40,9 +40,9 @@ \title{Class \code{pglmmObj} of Fitted Penalized Generalized Mixed-Effects Models for package \code{glmmPen}} \usage{ -\method{fixef}{pglmmObj}(object) +\method{fixef}{pglmmObj}(object, ...) -\method{ranef}{pglmmObj}(object) +\method{ranef}{pglmmObj}(object, ...) \method{sigma}{pglmmObj}(object, ...) diff --git a/man/sim.data.Rd b/man/sim.data.Rd index 87e4e86..5a75da6 100644 --- a/man/sim.data.Rd +++ b/man/sim.data.Rd @@ -29,6 +29,7 @@ sim.data.FA( family = "binomial", B = NULL, r = 2, + corr = NULL, seed, imbalance = 0, beta = NULL, @@ -53,8 +54,8 @@ random effects covariance matrix (applied to the non-zero random effects)} \item{family}{character string specifying which family to generate data from. Family options include "binomial" (default), "poisson", and "gaussian".} -\item{corr}{optional value to specify correlation in the random effects -covariance matrix. Default \code{NULL}, only available within \code{sim.data}.} +\item{corr}{optional value to specify correlation between covariates +in the model matrix. Default \code{NULL}, only available within \code{sim.data}.} \item{seed}{integer to use for the setting of a random seed} diff --git a/src/Makevars b/src/Makevars index 5c332f1..c7f1fce 100644 --- a/src/Makevars +++ b/src/Makevars @@ -10,9 +10,9 @@ ## support within Armadillo prefers / requires it STANHEADERS_SRC = `"$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" -e "message()" | grep "StanHeaders"` -PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -DBOOST_MATH_OVERFLOW_ERROR_POLICY=errno_on_error +PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -DBOOST_MATH_OVERFLOW_ERROR_POLICY=errno_on_error -D_HAS_AUTO_PTR_ETC=0 -CXX_STD = CXX14 +CXX_STD = CXX17 PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/Makevars.win b/src/Makevars.win index 7db8640..1f8a644 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -10,9 +10,9 @@ ## support within Armadillo prefers / requires it STANHEADERS_SRC = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "message()" -e "cat(system.file('include', 'src', package = 'StanHeaders', mustWork = TRUE))" -e "message()" | grep "StanHeaders") -PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG +PKG_CPPFLAGS = -I"../inst/include" -I"$(STANHEADERS_SRC)" -DBOOST_DISABLE_ASSERTS -DEIGEN_NO_DEBUG -D_HAS_AUTO_PTR_ETC=0 -CXX_STD = CXX14 +CXX_STD = CXX17 PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) diff --git a/src/coord_descent.cpp b/src/coord_descent.cpp index 395e849..e2acc05 100644 --- a/src/coord_descent.cpp +++ b/src/coord_descent.cpp @@ -119,15 +119,7 @@ arma::vec coord_desc(arma::vec y, arma::mat X, arma::vec weights, arma::vec resi converged = 1; } - // if(trace >= 2){ - // Rprintf("Results at end of iteration %i \n", iter); - // Rcout << "beta" << beta.t() << std::endl; - // Rcout << "weights" << weights.t() << std::endl; - // Rcout << "residuals" << resid.t() << std::endl; - // // Rcout << "mu" << mu.t() << std::endl; - // // Rcout << "deriv" << deriv.t() << std::endl; - // // Rcout << "Vmu" << Vmu.t() << std::endl; - // } + } // End while loop diff --git a/src/grp_CD_XZ_FA_step.cpp b/src/grp_CD_XZ_FA_step.cpp index 19980a2..9f43fad 100644 --- a/src/grp_CD_XZ_FA_step.cpp +++ b/src/grp_CD_XZ_FA_step.cpp @@ -28,13 +28,13 @@ using namespace arma; // [[Rcpp::export]] arma::vec grp_CD_XZ_FA_step(const arma::vec& y, const arma::mat& X, const arma::mat& Z, - const arma::vec& group, - SEXP pBigMat, const arma::sp_mat& J_f, arma::vec dims, - arma::vec beta, const arma::vec& offset, - double step_size, double sig_g, - const char* family, int link, int init, double phi, - const arma::uvec& X_group, arma::uvec K, // covariate group index and size of covariate groups - const char* penalty, arma::vec params, int trace) { + const arma::vec& group, + SEXP pBigMat, const arma::sp_mat& J_f, arma::vec dims, + arma::vec beta, const arma::vec& offset, + double step_size, double sig_g, + const char* family, int link, int init, double phi, + const arma::uvec& X_group, arma::uvec K, // covariate group index and size of covariate groups + const char* penalty, arma::vec params, int trace) { // y = response vector @@ -61,11 +61,9 @@ arma::vec grp_CD_XZ_FA_step(const arma::vec& y, const arma::mat& X, const arma:: int d = dims(2); // number of groups within observations int q = dims(3); // number of random effect covariates int M = dims(4); // number of MCMC draws (nrow(u)) - // int J_XZ = dims(5); // number of covariate groups (in fixed and random effects) int J_X = dims(5); // number of covariate groups in fixed effects double conv = dims(6); // Convergence threshold int maxit = dims(7); // maximum number of iterations - // int J_X = XZ_group(p); // Covariate group corresponding to random intercept int Kj = 0; // Size of covariate group j @@ -356,8 +354,7 @@ arma::vec grp_CD_XZ_FA_step(const arma::vec& y, const arma::mat& X, const arma:: // Then update all other random effects. // ----------------------------------------------------------------------------------// - // Identify covariates belonging to random intercept (group j = J_X) - // arma::uvec idxj = find(XZ_group == J_X); + // Identify covariates belonging to random intercept for(f=0;f #include "utility_glm.h" + using namespace Rcpp; // Calculates the M residuals for an individual given eta, updates nu and resid @@ -144,41 +145,15 @@ double Qfun_quad_beta(double Q0, double step_size, const arma::mat& diff0, beta_diff = beta - beta0; term2 = sum(beta_diff % beta_diff); - Q_quad = Q0 - term1 / M + (0.5 * N / step_size) * term2 ; // 0.5 * final term? + Q_quad = Q0 - term1 / M + (0.5 * N / step_size) * term2 ; return(Q_quad); } -// // Quadratic approximation to Q function estimate (based on Taylor series expansion about -// // linear predictor evaluated at previous beta0 coefficient estimates) -// // Note: in M-step, have already calculated Q evaluated at past coefficient value (Q0) -// // Note: nu = 1 / step_size -// // [[Rcpp::export]] -// double Qfun_quad(double Q0, double nu, -// const arma::mat& diff0, const arma::mat& eta, const arma::mat& eta0){ -// -// int N = eta.n_cols; -// int M = eta.n_rows; -// -// int i = 0; -// -// double Q_quad = 0; -// double term1 = 0; -// double term2 = 0; -// -// arma::vec eta_diff(M); -// -// for(i=0;i