NIEHS · ericbair-sciome · Nov 15, 2023 · Jul 17, 2023 · Jul 18, 2023 · Jul 21, 2023
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,7 @@
+^examples$
+^Makefile$
+^R/scratch\.R$
+^.*\.Rproj$
+^\.Rproj\.user$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -21,6 +21,7 @@ manuscript/
 **/._.DS_Store
 ../._create-prediction-grid.Rmd
 ../._map-nc-land-covariates.Rmd
+
 # Hidden folders
 .DS_Store/
 ._.DS_Store/
@@ -73,4 +74,16 @@ code/mitchell_tests/
 ._*
 
 # Insang's negative value exploration script
-tools/negative_exploration
+tools/negative_exploration
+
+# data files
+data
+
+# Automatic Emacs backup files
+**/*~
+
+# Rcpp compiled binary files
+src/*.so
+src/*.o
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,7 @@
 Package: PrestoGP
 Type: Package
-Title: Penalized Regression for Spatio-Temporal Outcomes via Gaussian
-        Processes
-Version: 0.2.0.9015
+Title: Penalized Regression for Spatio-Temporal Outcomes via Gaussian Processes 
+Version: 0.2.0.9018
 Authors@R: c(
     person(given = "Eric",
            family = "Bair",
@@ -21,26 +20,41 @@ Authors@R: c(
            family = "Messier",
 	   role = "aut"))
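-Description: Simultaneous variable seletion and estimation of LUR models with spatiotemporally correlated errors that is scalable for big data.
+Description: Simultaneous variable selection and estimation of LUR models with spatiotemporally correlated errors that is scalable for big data.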
-Depends: R (>= 3.5.0)
-LinkingTo: Rcpp, RcppArmadillo
-Imports: GPvecchia, Matrix, fields, ncvreg, readxl, scoringRules, MASS,
-        aod, knitr, dplyr, glmnet, rmarkdown, markdown, gtools, geoR,
-        doParallel
+Depends:
+    R (>= 3.5.0)
+LinkingTo:
+    Rcpp, RcppArmadillo
+Imports:
+    GPvecchia,
+    Matrix,
+    fields,
+    ncvreg,
+    readxl,
+    scoringRules,
+    MASS,
+    aod,
+    knitr,
+    dplyr,
+    glmnet,
+    rmarkdown,
+    markdown,
+    gtools,
+    geoR,
+    doParallel
 License: GPL-3
 Encoding: UTF-8
-VignetteBuilder: knitr
+VignetteBuilder: knitr 
 RoxygenNote: 7.2.3
-Collate: 'Log_Likelihood.R' 'PrestoGP_CreateU_Multivariate.R'
-        'PrestoGP_Model.R' 'PrestoGP_Vecchia_Spatiotemporal.R'
-        'PrestoGP_Full.R' 'PrestoGP_Vecchia_Spatial.R'
-        'PrestoGP_Full_Spatial.R' 'PrestoGP_Multivariate_Vecchia.R'
-        'PrestoGP_Util_Functions.R' 'RcppExports.R' 'Visualization.R'
-        'package.R'
-NeedsCompilation: yes
-Packaged: 2023-10-23 09:07:48 UTC; root
-Author: Eric Bair [aut, cre],
-  Brian Kidd [aut],
-  Eric Wimberley [aut],
-  Deepak Mav [aut],
-  Kyle Messier [aut]
-Maintainer: Eric Bair <eric.bair@sciome.com>
+Collate: 
+    'Log_Likelihood.R'
+    'PrestoGP_CreateU_Multivariate.R'
+    'PrestoGP_Model.R'
+    'PrestoGP_Vecchia_Spatiotemporal.R'
+    'PrestoGP_Full.R'
+    'PrestoGP_Vecchia_Spatial.R'
+    'PrestoGP_Full_Spatial.R'
+    'PrestoGP_Multivariate_Vecchia.R'
+    'PrestoGP_Util_Functions.R'
+    'RcppExports.R'
+    'Visualization.R'
+    'package.R'
diff --git a/Makefile b/Makefile
@@ -0,0 +1,33 @@
+# Developer convenience targets for documenting, building and checking the
+# R package that lives in this directory.
+
+# Package name and version are read from DESCRIPTION (patterns are anchored so
+# only the real field lines match); PKGSRC is the name of the current directory.
+PKGNAME := $(shell sed -n "s/^Package: *\([^ ]*\)/\1/p" DESCRIPTION)
+PKGVERS := $(shell sed -n "s/^Version: *\([^ ]*\)/\1/p" DESCRIPTION)
+PKGSRC  := $(shell basename `pwd`)
+
+# None of these targets creates a file of the same name. `doc` in particular
+# must be phony: an existing doc/ directory would otherwise make it "up to date".
+.PHONY: all clean build doc check
+
+all: clean build
+
+# Placeholder: nothing is removed yet.
+clean:
+	echo "Clean"
+
+# Regenerate the documentation, then run R CMD build from the parent directory.
+build: doc
+	echo $(PKGNAME) $(PKGVERS)
+	cd .. && R CMD build $(PKGSRC)
+
+# Run roxygen2::roxygenise() and devtools::build_manual().
+doc:
+	R --slave -e 'library(roxygen2); roxygenise()'
+	R --slave -e 'library(devtools); build_manual()'
+	#-git add --all man/*.Rd
+
+# Run devtools::check() with error_on="error".
+check:
+	R --slave -e 'library(devtools); check(error_on="error")'
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,8 +9,8 @@ export(SpatialModel)
 export(SpatiotemporalFullModel)
 export(SpatiotemporalModel)
 export(createUMultivariate)
+export(negloglik.full)
 export(negloglik_full_ST)
-export(negloglik_full_spatial)
 export(negloglik_vecchia)
 export(negloglik_vecchia_ST)
 export(vecchia_Mprediction)

diff --git a/R/Log_Likelihood.R b/R/Log_Likelihood.R
@@ -60,7 +60,7 @@ negloglik_full_ST=function(logparms,locs,y,N){
   -mvtnorm::dmvnorm(y, rep(0,N), cov.mat, log=TRUE)
 }
 
-#' negloglik_full_spatial
+#' negloglik.full
 #'
 #' Spatial Full Kriging negative loglikelihood
 #'
@@ -74,12 +74,15 @@ negloglik_full_ST=function(logparms,locs,y,N){
 #'
 #' @examples
 #' @noRd
-negloglik_full_spatial=function(logparms,locs,y,N){
-  parms = exp(logparms)
-  d <- fields::rdist(locs)
-  cov.mat=parms[1]*fields::Exponential(d,range=parms[2])+
-    parms[3]*diag(N)
-  -mvtnorm::dmvnorm(y,rep(0,N),cov.mat,log=TRUE)
+negloglik.full <- function(logparams, locs, y) {
+  # Entries 1, 2 and 4 are back-transformed with exp(); entry 3 goes through
+  # an inverse logit with bounds 0 and 2.5.
+  theta <- c(exp(logparams[1:2]),
+             gtools::inv.logit(logparams[3], 0, 2.5), exp(logparams[4]))
+  dist.mat <- fields::rdist(locs)
+  matern <- fields::Matern(dist.mat, range = theta[2], smoothness = theta[3])
+  sigma <- theta[1] * matern + theta[4] * diag(nrow(dist.mat))
+  -mvtnorm::dmvnorm(y, rep(0, nrow(dist.mat)), sigma, log = TRUE)
 }
 
 
@@ -157,7 +160,7 @@ mvnegloglik_ST =function(logparams,vecchia.approx,y,param.seq,P,scaling,nscale){
         for (j in 1:nscale) {
             locs.scaled[vecchia.approx$ondx==i,scaling==j] <-
                 locs.scaled[vecchia.approx$ondx==i,scaling==j] /
-                params[param.seq[2,1]+nscale*(i-1)+j-1] 
+                params[param.seq[2,1]+nscale*(i-1)+j-1]
         }
     }
     vecchia.approx$locsord <- locs.scaled
@@ -171,7 +174,7 @@ mvnegloglik_ST =function(logparams,vecchia.approx,y,param.seq,P,scaling,nscale){
 ##############################################################################
 ### Full Multivariate Matern Negative Loglikelihood Function ###########
 
-mvnegloglik.full=function(logparams,locs,y,param.seq,P){
+mvnegloglik.full=function(logparams,locs,y,param.seq){
   #  Input-
   #  logparams: A numeric vector of length (4*P)+(4*choose(P,2)).
   #             To construct these parameters we unlist a list of the 7 covariance
@@ -191,6 +194,7 @@ mvnegloglik.full=function(logparams,locs,y,param.seq,P){
 
   #P <- length(y)
-  # transform the postively constrained parameters from log-space to normal-space
+  # transform the positively constrained parameters from log-space to normal-space
+  P <- length(locs)
   params <- c(exp(logparams[1:param.seq[2,2]]),
               gtools::inv.logit(logparams[param.seq[3,1]:param.seq[3,2]], 0, 2.5),
               exp(logparams[param.seq[4,1]:param.seq[4,2]]))
@@ -279,12 +283,17 @@ cat.covariances <- function(locs.list,sig2,range,smoothness,nugget){
   for (iter in 1:nrow(combs)){
     i <- combs[iter,1]
     j <- combs[iter,2]
-    d <- fields::rdist.earth(locs.list[[i]],locs.list[[j]],miles = FALSE)
-
+    # d <- fields::rdist.earth(locs.list[[i]],locs.list[[j]],miles = FALSE)
+    d <- fields::rdist(locs.list[[i]],locs.list[[j]])
     # Calculate the covariance matrix - if/then based on its location in the super-matrix
-    N <- nrow(d)
-    cov.mat.ij <- sig2[i,j]*geoR::matern(d,phi = range[i,j],kappa = smoothness[i,j])+
-      nugget[i,j]*diag(N)
+      N <- nrow(d)
+      if (i==j){ # To accommodate outcomes of varying size, the nugget is not included on cross-covariances
+        cov.mat.ij <- sig2[i,j]*geoR::matern(d,phi = range[i,j],kappa = smoothness[i,j])+
+          nugget[i,j]*diag(N)
+      }else{
+        cov.mat.ij <- sig2[i,j]*geoR::matern(d,phi = range[i,j],kappa = smoothness[i,j])
+      }
+
 
     if (combs[iter,1]==1){
       row.idx <- 1:dims[1]
@@ -307,6 +316,7 @@ cat.covariances <- function(locs.list,sig2,range,smoothness,nugget){
 
   }
 
+
   return(cov.mat.out)
 }
 

diff --git a/R/PrestoGP_CreateU_Multivariate.R b/R/PrestoGP_CreateU_Multivariate.R
@@ -167,6 +167,11 @@ knn_indices <- function(ordered_locs, query, n_neighbors, dist_func){
 #'
 #' @return A list containing two matrices, each with one row per location: an indices matrix with the indices of nearest neighbors for each location, and a distance matrix with the associated distances
 sparseNN <- function(ordered_locs, n_neighbors, dist_func, ordered_locs_pred=NULL){
+    ee <- min(apply(ordered_locs, 2, stats::sd))
+    n <- nrow(ordered_locs)
+    ordered_locs <- ordered_locs + matrix(ee*1e-04*
+                                          stats::rnorm(n*ncol(ordered_locs)),
+                                          n, ncol(ordered_locs))
   indices_matrix = matrix(data=NA, nrow=nrow(ordered_locs), ncol=n_neighbors)
   distances_matrix = matrix(data=NA, nrow=nrow(ordered_locs), ncol=n_neighbors)
   for(row in 1:n_neighbors){

diff --git a/R/PrestoGP_Model.R b/R/PrestoGP_Model.R
@@ -60,14 +60,14 @@ setMethod("initialize", "PrestoGPModel", function(.Object, ...) {
 })
 
 setGeneric("show_theta", function(object, Y_names)standardGeneric("show_theta") )
-setGeneric("prestogp_fit", function(model, Y, X, locs, scaling=NULL, apanasovich=FALSE, covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100, verbose=FALSE, optim.method="Nelder-Mead", optim.control=list(trace=0, reltol=1e-4, maxit=5000), parallel=FALSE) standardGeneric("prestogp_fit") )
+setGeneric("prestogp_fit", function(model, Y, X, locs, scaling=NULL, apanasovich=FALSE, covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters = 100, verbose=FALSE, optim.method="Nelder-Mead", optim.control=list(trace=0, reltol=1e-3, maxit=5000), parallel=FALSE, foldid=NULL) standardGeneric("prestogp_fit") )
 setGeneric("prestogp_predict", function(model, X="matrix", locs="matrix", m="numeric", ordering.pred=c("obspred", "general"), pred.cond=c("independent", "general"), return.values=c("mean", "meanvar")) standardGeneric("prestogp_predict") )
 setGeneric("calc_covparams", function(model, locs, Y) standardGeneric("calc_covparams") )
 setGeneric("specify", function(model, locs, m)standardGeneric("specify") )
 setGeneric("compute_residuals", function(model, Y, Y.hat) standardGeneric("compute_residuals") )
 setGeneric("transform_data", function(model, Y, X) standardGeneric("transform_data") )
 setGeneric("estimate_theta", function(model, locs, optim.control, method) standardGeneric("estimate_theta") )
-setGeneric("estimate_betas", function(model, parallel) standardGeneric("estimate_betas") )
+setGeneric("estimate_betas", function(model, parallel, foldid) standardGeneric("estimate_betas") )
 setGeneric("compute_error", function(model, y, X) standardGeneric("compute_error") )
 setGeneric("scale_locs", function(model, locs) standardGeneric("scale_locs") )
 setGeneric("theta_names", function(model) standardGeneric("theta_names") )
@@ -181,7 +181,7 @@ setMethod("show_theta", "PrestoGPModel",
 #' model <- prestogp_fit(model, logNO2, X, locs)
 #' ...
 setMethod("prestogp_fit", "PrestoGPModel",
-          function(model, Y, X, locs, scaling=NULL, apanasovich=FALSE, covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters=100, verbose=FALSE, optim.method="Nelder-Mead", optim.control=list(trace=0, reltol=1e-4, maxit=5000), parallel=FALSE) {
+          function(model, Y, X, locs, scaling=NULL, apanasovich=FALSE, covparams = NULL, beta.hat = NULL, tol = 0.999999, max_iters=100, verbose=FALSE, optim.method="Nelder-Mead", optim.control=list(trace=0, reltol=1e-3, maxit=5000), parallel=FALSE, foldid=NULL) {
             #parameter validation
             #TODO: This method should check for input errors in the
             #multivariate case (where Y, X, and locs are lists)
@@ -193,7 +193,7 @@ setMethod("prestogp_fit", "PrestoGPModel",
             if(!is.double(tol)){ stop("The tol parameter must be floating point number.") }
             if (is.matrix(Y)) {
                 if(nrow(Y) != nrow(X)){ stop("Y must have the same number of rows as X.") }
-                if(ncol(Y) < 1){ stop("Y must have at least 1 column.") }
+                if(ncol(Y) != 1){ stop("Y must have only 1 column.") }
                 if(nrow(Y) != nrow(locs)){ stop("Y must have the same number of rows as locs.") }
             }
             if (is.null(scaling)) {
@@ -265,7 +265,9 @@ setMethod("prestogp_fit", "PrestoGPModel",
               model <- specify(model, locs, m)
 
               if (is.null(beta.hat)) {
-                  beta0.glmnet <- cv.glmnet(model@X_train, model@Y_train)
+                  beta0.glmnet <- cv.glmnet(model@X_train, model@Y_train,
+                                            parallel=parallel,
+                                            foldid=foldid)
                   beta.hat <- as.matrix(predict(beta0.glmnet,
                                                 type="coefficients",
                                                 s=beta0.glmnet$lambda.1se))
@@ -288,7 +290,7 @@ setMethod("prestogp_fit", "PrestoGPModel",
                   model <- specify(model, locs, m)
               }
               model <- transform_data(model, model@Y_train, model@X_train)
-              model <- estimate_betas(model, parallel)
+              model <- estimate_betas(model, parallel, foldid)
               min.error <- compute_error(model)
               ### Check min-error against the previous error and tolerance
               if(min.error<prev.error*tol) {
@@ -325,11 +327,11 @@ setMethod("prestogp_fit", "PrestoGPModel",
-#' @param model the model to estimate coeffients for
+#' @param model the model to estimate coefficients for
 #'
 #' @return A model with updated coefficients
-setMethod("estimate_betas", "PrestoGPModel", function(model, parallel) {
+setMethod("estimate_betas", "PrestoGPModel", function(model, parallel, foldid) {
   if(ncol(model@Y_train) > 1){
-    model@linear_model <- cv.glmnet(as.matrix(model@X_tilde), as.matrix(model@y_tilde), family="mgaussian", alpha = model@alpha, parallel=parallel)
+    model@linear_model <- cv.glmnet(as.matrix(model@X_tilde), as.matrix(model@y_tilde), family="mgaussian", alpha = model@alpha, parallel=parallel, foldid=foldid)
   } else {
-    model@linear_model <- cv.glmnet(as.matrix(model@X_tilde), as.matrix(model@y_tilde), alpha = model@alpha, parallel=parallel)
+    model@linear_model <- cv.glmnet(as.matrix(model@X_tilde), as.matrix(model@y_tilde), alpha = model@alpha, parallel=parallel, foldid=foldid)
   }
   idmin <- which(model@linear_model$lambda == model@linear_model$lambda.min)
   semin <- model@linear_model$cvm[idmin] + model@linear_model$cvsd[idmin]