Imputation beta implemented
ericbair-sciome committed Apr 4, 2024
1 parent 1f4bcc7 commit 6bafe5c
Showing 35 changed files with 369 additions and 15 deletions.
Empty file modified .github/workflows/check-standard.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/lint.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/release.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/sanitizers.yaml
100644 → 100755
Empty file.
Empty file modified .github/workflows/test-coverage.yaml
100644 → 100755
Empty file.
Empty file modified .lintr
100644 → 100755
Empty file.
49 changes: 43 additions & 6 deletions R/PrestoGP_Model.R
@@ -21,6 +21,8 @@ setOldClass("cv.glmnet")
#' "super matrix" for multivariate models. See
#' \code{\link[psych]{superMatrix}}.
#' @slot Y_train A column matrix containing the original response values.
#' @slot Y_bar A vector containing the means of each outcome.
#' @slot Y_obs A logical vector used to track which values of Y are non-missing.
#' @slot X_tilde The matrix of transformed predictors.
#' @slot y_tilde The column matrix containing the transformed response values.
#' @slot res numeric.
@@ -78,6 +80,8 @@ PrestoGPModel <- setClass("PrestoGPModel",
linear_model = "cv.glmnet", # the linear model
X_train = "matrix", # the original independent variable matrix
Y_train = "matrix", # the original dependent variable matrix
Y_bar = "numeric", # the means of each outcome variable
Y_obs = "logical", # a logical vector indicating whether the ith observation was observed
locs_train = "list", # the location / temporal matrix
converged = "logical", # a logical variable that is true if the model fitting process has converged
LL_Vecchia_krig = "numeric", # the value of the negative log likelihood function after optimization
@@ -115,7 +119,8 @@ setGeneric(
"prestogp_fit",
function(model, Y, X, locs, scaling = NULL, apanasovich = FALSE,
covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100,
verbose = FALSE, optim.method = "Nelder-Mead",
center.y = NULL, impute.y = FALSE, lod = NULL, verbose = FALSE,
optim.method = "Nelder-Mead",
optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
family = c("gaussian", "binomial"), nfolds = 10, foldid = NULL,
parallel = FALSE) {
@@ -214,11 +219,12 @@ setGeneric("compute_residuals", function(model, Y, Y.hat, family) standardGeneri
setGeneric("transform_data", function(model, Y, X) standardGeneric("transform_data"))
setGeneric("estimate_theta", function(model, locs, optim.control, method) standardGeneric("estimate_theta"))
setGeneric("estimate_betas", function(model, family, nfolds, foldid, parallel) standardGeneric("estimate_betas"))
setGeneric("impute_y", function(model) standardGeneric("impute_y"))
setGeneric("compute_error", function(model, y, X) standardGeneric("compute_error"))
setGeneric("scale_locs", function(model, locs) standardGeneric("scale_locs"))
setGeneric("theta_names", function(model) standardGeneric("theta_names"))
setGeneric("transform_covariance_parameters", function(model) standardGeneric("transform_covariance_parameters"))
setGeneric("check_input", function(model, Y, X, locs) standardGeneric("check_input"))
setGeneric("check_input", function(model, Y, X, locs, center.y, impute.y, lod) standardGeneric("check_input"))
setGeneric("check_input_pred", function(model, X, locs) standardGeneric("check_input_pred"))

#' show
@@ -329,6 +335,14 @@ setMethod(
#' tol*previous_error (optional). Defaults to 0.999999.
#' @param max_iters Maximum number of iterations for the model fitting
#' procedure. Defaults to 100.
#' @param center.y Should the Y's be mean centered before fitting the model?
#' Defaults to TRUE for gaussian models and FALSE for binomial models.
#' @param impute.y Should missing Y's be imputed? Defaults to FALSE.
#' @param lod Limit of detection value(s). Any Y value less than lod is
#' assumed to be missing when performing missing data imputation. Should be
#' numeric for univariate models and a list for multivariate models, where
#' each element of the list corresponds to an outcome. If not specified, it
#' is assumed that no limit of detection exists. Ignored if impute.y is FALSE.
#' @param verbose If TRUE, additional information about model fit will be
#' printed. Defaults to FALSE.
#' @param optim.method Optimization method to be used for the maximum
@@ -408,13 +422,33 @@ setMethod(
setMethod(
"prestogp_fit", "PrestoGPModel",
function(model, Y, X, locs, scaling = NULL, apanasovich = NULL,
covparams = NULL, beta.hat = NULL, tol = 0.999999,
max_iters = 100, verbose = FALSE, optim.method = "Nelder-Mead",
covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100,
center.y = NULL, impute.y = FALSE, lod = NULL, verbose = FALSE,
optim.method = "Nelder-Mead",
optim.control = list(trace = 0, reltol = 1e-3, maxit = 5000),
family = c("gaussian", "binomial"),
nfolds = 10, foldid = NULL, parallel = FALSE) {
model <- check_input(model, Y, X, locs)
family <- match.arg(family)
if (is.null(center.y)) {
if (family == "gaussian") {
center.y <- TRUE
} else {
center.y <- FALSE
}
}
model <- check_input(model, Y, X, locs, center.y, impute.y, lod)
if (is.null(lod)) {
lodv <- rep(Inf, nrow(model@Y_train))
} else {
if (is.numeric(lod)) {
lod <- list(lod)
}
lodv <- NULL
for (i in seq_along(lod)) {
lodv <- c(lodv, rep(lod[[i]] - model@Y_bar[i],
nrow(model@locs_train[[i]])))
}
}
if (!is.null(beta.hat)) {
if (!is.vector(beta.hat) | !is.numeric(beta.hat)) {
stop("beta.hat parameter must be a numeric vector")
@@ -524,6 +558,9 @@ setMethod(
s = "lambda.1se",
type = "response")
)
model <- impute_y(model)
alod <- !model@Y_obs & model@Y_train > lodv
model@Y_train[alod] <- lodv[alod]
covparams.iter <- model@covparams
Vecchia.SCAD.iter <- model@linear_model
} else {
@@ -653,7 +690,7 @@ setMethod("calc_covparams", "PrestoGPModel", function(model, locs, Y, covparams)
col.vars <- rep(NA, P)
D.sample.bar <- rep(NA, model@nscale * P)
for (i in 1:P) {
col.vars[i] <- var(Y[[i]])
col.vars[i] <- var(Y[[i]], na.rm = TRUE)
N <- length(Y[[i]])
# TODO find a better way to compute initial spatial range
for (j in 1:model@nscale) {
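
For illustration, a minimal usage sketch of the new fitting options added in this commit. The data are simulated, and the new("VecchiaModel", n_neighbors = 25) constructor call is an assumption, not taken from this diff; consult the package documentation for the exact way to create a model object.

# Hypothetical example of the new imputation options in prestogp_fit():
# values below a limit of detection of 0.5 are recorded as missing and imputed.
library(PrestoGP)

set.seed(1)
n <- 200
locs <- cbind(runif(n), runif(n))     # spatial coordinates
X <- cbind(rnorm(n), rnorm(n))        # predictors
Y <- X %*% c(1, -0.5) + rnorm(n)      # response with a linear trend
Y[Y < 0.5] <- NA                      # values under the LOD recorded as missing

model <- new("VecchiaModel", n_neighbors = 25)   # assumed constructor call
model <- prestogp_fit(model, Y, X, locs,
                      center.y = TRUE,   # default for gaussian models
                      impute.y = TRUE,   # impute the missing Y's
                      lod = 0.5)         # values below 0.5 treated as censored
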
116 changes: 112 additions & 4 deletions R/PrestoGP_Multivariate_Vecchia.R
@@ -38,7 +38,9 @@ setMethod("prestogp_predict", "MultivariateVecchiaModel",
ordering.pred <- match.arg(ordering.pred)
pred.cond <- match.arg(pred.cond)
return.values <- match.arg(return.values)
X <- check_input_pred(model, X, locs)
pred.list <- check_input_pred(model, X, locs)
X <- pred.list$X
Y_bar <- pred.list$Y_bar
if (is.null(m)) { # m defaults to the value used for training
m <- model@n_neighbors
}
@@ -91,7 +93,7 @@

# prediction function can return both mean and sds
# returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
Vec.mean <- pred$mu.pred + Vecchia.Pred # residual + mean trend
Vec.mean <- Y_bar + pred$mu.pred + Vecchia.Pred # residual + mean trend
if (return.values == "mean") {
return.list <- list(means = Vec.mean)
} else {
@@ -112,7 +114,7 @@
}
)

setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs) {
setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs, center.y, impute.y, lod) {
if (!is.list(locs)) {
stop("locs must be a list for multivariate models")
}
@@ -128,6 +130,14 @@ setMethod("check_input", "MultivariateVecchiaModel", function(model, Y, X, locs)
if (length(locs) != length(X)) {
stop("locs and X must have the same length")
}
if (!is.null(lod)) {
if (!is.list(lod)) {
stop("lod must be a list for multivariate models")
}
if (length(locs) != length(lod)) {
stop("locs and lod must have the same length")
}
}
for (i in seq_along(locs)) {
if (!is.matrix(locs[[i]])) {
stop("Each locs must be a matrix")
@@ -155,9 +165,44 @@
if (nrow(Y[[i]]) != nrow(X[[i]])) {
stop("Each Y must have the same number of rows as X")
}
if (sum(is.na(X[[i]])) > 0) {
stop("X must not contain NA's")
}
if (sum(is.na(locs[[i]])) > 0) {
stop("locs must not contain NA's")
}
if (sum(is.na(Y[[i]])) > 0 & !impute.y) {
stop("Y contains NA's and impute.y is FALSE. Set impute.y=TRUE to impute missing Y's.")
}
if (!is.null(lod[[i]])) {
if (!is.numeric(lod[[i]])) {
stop("Each lod must be numeric")
}
if (length(lod[[i]]) != 1) {
stop("Each lod must have length 1")
}
}
}
Y_bar <- rep(NA, length(Y))
Y_obs <- NULL
for (i in seq_along(Y)) {
Y_obs <- c(Y_obs, !is.na(Y[[i]]))
if (!is.null(lod)) {
Y[[i]][is.na(Y[[i]])] <- lod[[i]] / 2
}
if (center.y) {
Y_bar[i] <- mean(Y[[i]], na.rm = TRUE)
Y[[i]] <- Y[[i]] - Y_bar[i]
Y[[i]][is.na(Y[[i]])] <- 0
} else {
Y[[i]][is.na(Y[[i]])] <- mean(Y[[i]], na.rm = TRUE)
Y_bar[i] <- 0
}
}
model@Y_bar <- Y_bar
model@locs_train <- locs
model@Y_train <- as.matrix(unlist(Y))
model@Y_obs <- Y_obs
if (length(X) == 1) {
model@X_train <- X[[1]]
} else {
@@ -195,13 +240,76 @@ setMethod("check_input_pred", "MultivariateVecchiaModel", function(model, X, loc
}
if (length(X) == 1) {
X <- X[[1]]
Y_bar <- rep(model@Y_bar[1], nrow(X))
} else {
Y_bar <- NULL
for (i in seq_along(X)) {
Y_bar <- c(Y_bar, rep(model@Y_bar[i], nrow(X[[i]])))
}
X <- psych::superMatrix(X)
}
if (ncol(X) != ncol(model@X_train)) {
stop("X and X_train must have the same number of predictors")
}
return(X)
return(list(X = X, Y_bar = Y_bar))
})

setMethod("impute_y", "MultivariateVecchiaModel", function(model) {
if (sum(!model@Y_obs) > 0) {
Vecchia.Pred <- predict(model@linear_model,
newx = model@X_train[!model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])
Vecchia.hat <- predict(model@linear_model,
newx = model@X_train[model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])

# Test set prediction
res <- model@Y_train[model@Y_obs] - Vecchia.hat

locs.scaled <- scale_locs(model, model@locs_train)
all.obs <- model@Y_obs
locs.otr <- list()
locs.otst <- list()
for (i in seq_along(model@locs_train)) {
nl <- nrow(locs.scaled[[i]])
cur.obs <- all.obs[1:nl]
locs.otr[[i]] <- locs.scaled[[i]][cur.obs, ]
locs.otst[[i]] <- locs.scaled[[i]][!cur.obs, ]
all.obs <- all.obs[-(1:nl)]
}

if (model@apanasovich & (length(model@locs_train) > 1)) {
locs.nd <- eliminate_dupes(locs.otr, locs.otst)
locs.otr <- locs.nd$locs
locs.otst <- locs.nd$locs.pred
}
vec.approx.test <- vecchia_Mspecify(locs.otr, model@n_neighbors,
locs.list.pred = locs.otst,
ordering.pred = "obspred",
pred.cond = "independent"
)

## carry out prediction
if (!model@apanasovich) {
params <- model@covparams
param.seq <- model@param_sequence
pred <- vecchia_Mprediction(res, vec.approx.test,
c(
params[1:param.seq[1, 2]],
rep(1, param.seq[2, 2] - param.seq[2, 1] + 1),
params[param.seq[3, 1]:param.seq[5, 2]]
),
return.values = "mean"
)
} else {
pred <- vecchia_Mprediction(res, vec.approx.test, model@covparams,
return.values = "mean"
)
}

model@Y_train[!model@Y_obs] <- pred$mu.pred + Vecchia.Pred
}
invisible(model)
})

setMethod("specify", "MultivariateVecchiaModel", function(model) {
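
To illustrate the preprocessing that the multivariate check_input now performs, here is a stand-alone base-R sketch of the same idea: missing Y's start at half the limit of detection when one is given, and each outcome is optionally mean-centered, with the mean and the missingness pattern recorded for later use. The helper name preprocess_y is illustrative, not part of the package.

# Per-outcome preprocessing sketch (mirrors the new check_input logic):
preprocess_y <- function(Y, lod = NULL, center.y = TRUE) {
  Y_bar <- rep(NA_real_, length(Y))
  Y_obs <- NULL
  for (i in seq_along(Y)) {
    Y_obs <- c(Y_obs, !is.na(Y[[i]]))
    if (!is.null(lod)) {
      Y[[i]][is.na(Y[[i]])] <- lod[[i]] / 2   # censored values start at half the LOD
    }
    if (center.y) {
      Y_bar[i] <- mean(Y[[i]], na.rm = TRUE)
      Y[[i]] <- Y[[i]] - Y_bar[i]
      Y[[i]][is.na(Y[[i]])] <- 0              # remaining NA's start at the centered mean
    } else {
      Y[[i]][is.na(Y[[i]])] <- mean(Y[[i]], na.rm = TRUE)
      Y_bar[i] <- 0
    }
  }
  list(Y = Y, Y_bar = Y_bar, Y_obs = Y_obs)
}

# Example: two outcomes, each with an LOD of 1
out <- preprocess_y(list(c(2, NA, 3), c(NA, 4, 5)), lod = list(1, 1))
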
75 changes: 73 additions & 2 deletions R/PrestoGP_Vecchia.R
@@ -81,7 +81,8 @@ setMethod("prestogp_predict", "VecchiaModel",

# prediction function can return both mean and sds
# returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
Vec.mean <- pred$mu.pred + Vecchia.Pred # residual + mean trend
# residual + mean trend
Vec.mean <- model@Y_bar + pred$mu.pred + Vecchia.Pred
if (return.values == "mean") {
return.list <- list(means = Vec.mean)
} else {
@@ -94,7 +95,7 @@
}
)

setMethod("check_input", "VecchiaModel", function(model, Y, X, locs) {
setMethod("check_input", "VecchiaModel", function(model, Y, X, locs, center.y, impute.y, lod) {
if (!is.matrix(locs)) {
stop("locs must be a matrix")
}
@@ -116,6 +117,35 @@ setMethod("check_input", "VecchiaModel", function(model, Y, X, locs) {
if (nrow(Y) != nrow(X)) {
stop("Y must have the same number of rows as X")
}
if (sum(is.na(X)) > 0) {
stop("X must not contain NA's")
}
if (sum(is.na(locs)) > 0) {
stop("locs must not contain NA's")
}
if (sum(is.na(Y)) > 0 & !impute.y) {
stop("Y contains NA's and impute.y is FALSE. Set impute.y=TRUE to impute missing Y's.")
}
if (!is.null(lod)) {
if (!is.numeric(lod)) {
stop("lod must be numeric")
}
if (length(lod) != 1) {
stop("lod must have length 1")
}
}
model@Y_obs <- !is.na(as.vector(Y))
if (!is.null(lod)) {
Y[!model@Y_obs] <- lod / 2
}
if (center.y) {
model@Y_bar <- mean(Y, na.rm = TRUE)
Y <- Y - model@Y_bar
Y[is.na(Y)] <- 0
} else {
model@Y_bar <- 0
Y[is.na(Y)] <- mean(Y, na.rm = TRUE)
}
model@X_train <- X
model@Y_train <- Y
model@locs_train <- list(locs)
@@ -141,6 +171,47 @@ setMethod("check_input_pred", "VecchiaModel", function(model, X, locs) {
invisible(model)
})

setMethod("impute_y", "VecchiaModel", function(model) {
if (sum(!model@Y_obs) > 0) {
Vecchia.Pred <- predict(model@linear_model,
newx = model@X_train[!model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])
Vecchia.hat <- predict(model@linear_model,
newx = model@X_train[model@Y_obs, ],
s = model@linear_model$lambda[model@lambda_1se_idx])

# Test set prediction
res <- model@Y_train[model@Y_obs] - Vecchia.hat

locs.scaled <- scale_locs(model, model@locs_train)[[1]]
vec.approx.test <- vecchia_specify(locs.scaled[model@Y_obs, ],
model@n_neighbors,
locs.pred = locs.scaled[!model@Y_obs, ],
ordering.pred = "obspred", pred.cond = "independent")

## carry out prediction
if (!model@apanasovich) {
pred <- vecchia_prediction(
res,
vec.approx.test,
c(model@covparams[1], 1, model@covparams[3]),
model@covparams[4],
return.values = "mean"
)
} else {
pred <- vecchia_prediction(
res,
vec.approx.test,
c(model@covparams[1], model@covparams[2], model@covparams[3]),
model@covparams[4], return.values = "mean"
)
}

model@Y_train[!model@Y_obs] <- pred$mu.pred + Vecchia.Pred
}
invisible(model)
})

#' specify
#'
#' Specify the conditioning set using m nearest neighbors.
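
Conceptually, the new univariate impute_y method fills in each missing response with the linear-model trend plus a kriged residual at the unobserved locations. The stand-alone sketch below illustrates that idea using GPvecchia directly; the simulated data, the lm() trend (the package uses its fitted cv.glmnet model instead), and the covariance parameters are placeholders, not the package's actual values.

# Residual-kriging imputation sketch under a Vecchia approximation
library(GPvecchia)

set.seed(2)
n <- 150
locs <- cbind(runif(n), runif(n))
x <- rnorm(n)
y <- 2 * x + rnorm(n)
y[sample(n, 20)] <- NA                 # some responses missing
obs <- !is.na(y)

fit <- lm(y ~ x, subset = obs)         # stand-in for the penalized regression trend
res <- y[obs] - predict(fit, data.frame(x = x[obs]))

vec <- vecchia_specify(locs[obs, ], m = 25,
                       locs.pred = locs[!obs, ],
                       ordering.pred = "obspred",
                       pred.cond = "independent")
pred <- vecchia_prediction(res, vec,
                           covparms = c(1, 0.2, 0.5),  # sigma^2, range, smoothness (placeholders)
                           nuggets = 0.1,
                           return.values = "mean")

# Impute: trend at the missing locations plus the kriged residual
y[!obs] <- predict(fit, data.frame(x = x[!obs])) + pred$mu.pred
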
7 changes: 4 additions & 3 deletions R/RcppExports.R
@@ -2,9 +2,10 @@
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

na_omit_c <- function(x) {
.Call('_PrestoGP_na_omit_c', PACKAGE = 'PrestoGP', x)
.Call('_PrestoGP_na_omit_c', PACKAGE = 'PrestoGP', x)
}

createU_helper_mat <- function(olocs, ondx, curqys, curqzs, vijs, aijs, full_const, nugget, sig2, U_beginning) {
.Call('_PrestoGP_createU_helper_mat', PACKAGE = 'PrestoGP', olocs, ondx, curqys, curqzs, vijs, aijs, full_const, nugget, sig2, U_beginning)
}
.Call('_PrestoGP_createU_helper_mat', PACKAGE = 'PrestoGP', olocs, ondx, curqys, curqzs, vijs, aijs, full_const, nugget, sig2, U_beginning)
}

Empty file modified R/Visualization.R
100644 → 100755
Empty file.
Empty file modified R/utils-tidy-eval.R
100644 → 100755
Empty file.
Empty file modified man/FullModel-class.Rd
100644 → 100755
Empty file.
Empty file modified man/MultivariateVecchiaModel-class.Rd
100644 → 100755
Empty file.
Empty file modified man/PrestoGP-package.Rd
100644 → 100755
Empty file.