no memory leaks

HannaMeyer · Jan 9, 2025 · 9b18abb · 9b18abb
1 parent e7bd2fa
commit 9b18abb
Show file tree

Hide file tree

Showing 14 changed files with 89 additions and 62 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CAST
 Type: Package
 Title: 'caret' Applications for Spatial-Temporal Models
-Version: 1.0.2
+Version: 1.0.3
 Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", role = c("cre", "aut")),
              person("Carles", "Milà", role = c("aut")),
              person("Marvin", "Ludwig", role = c("aut")),
@@ -10,8 +10,9 @@ Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", rol
              person("Philipp", "Otto", role = c("ctb")),
              person("Chris", "Reudenbach", role = c("ctb")),
              person("Thomas", "Nauss", role = c("ctb")),
-             person("Edzer", "Pebesma", role = c("ctb")))
-Author: Hanna Meyer [cre, aut], Carles Milà [aut], Marvin Ludwig [aut], Jan Linnenbrink [aut], Fabian Schumacher [aut], Philipp Otto [ctb], Chris Reudenbach [ctb], Thomas Nauss [ctb], Edzer Pebesma [ctb]
+             person("Edzer", "Pebesma", role = c("ctb")),
+             person("Jakub", "Nowosad", role = c("ctb")))
+Author: Hanna Meyer [cre, aut], Carles Milà [aut], Marvin Ludwig [aut], Jan Linnenbrink [aut], Fabian Schumacher [aut], Philipp Otto [ctb], Chris Reudenbach [ctb], Thomas Nauss [ctb], Edzer Pebesma [ctb], Jakub Nowosad [ctb]
 Maintainer: Hanna Meyer <hanna.meyer@uni-muenster.de>
 Description: Supporting functionality to run 'caret' with spatial or spatial-temporal data. 'caret' is a frequently used package for model training and prediction using machine learning. CAST includes functions to improve spatial or spatial-temporal modelling tasks using 'caret'. It includes the newly suggested 'Nearest neighbor distance matching' cross-validation to estimate the performance of spatial prediction models and allows for spatial variable selection to selects suitable predictor variables in view to their contribution to the spatial model performance. CAST further includes functionality to estimate the (spatial) area of applicability of prediction models. Methods are described in Meyer et al. (2018) <doi:10.1016/j.envsoft.2017.12.001>; Meyer et al. (2019) <doi:10.1016/j.ecolmodel.2019.108815>; Meyer and Pebesma (2021) <doi:10.1111/2041-210X.13650>; Milà et al. (2022) <doi:10.1111/2041-210X.13851>; Meyer and Pebesma (2022) <doi:10.1038/s41467-022-29838-9>; Linnenbrink et al. (2023) <doi:10.5194/egusphere-2023-1308>; Schumacher et al. (2024) <doi:10.5194/egusphere-2024-2730>. The package is described in detail in Meyer et al. (2024) <doi:10.48550/arXiv.2404.06978>.
 License: GPL (>= 2)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# `CAST` 1.0.3
+* bug fix: default algorithm for FNN functions changed
+
 # `CAST` 1.0.2
 * bug fix: tests run conditionally
 

diff --git a/R/aoa.R b/R/aoa.R
@@ -31,6 +31,7 @@
 #' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})
 #' @param indices logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the \href{https://github.com/fab-scm/CASTvis}{CASTvis package} for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues.
 #' @param verbose Logical. Print progress or not?
+#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}
 #' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated.
 #' If variables are factors, dummy variables are created prior to weighting and distance calculation.
 #'
@@ -157,7 +158,8 @@ aoa <- function(newdata,
                 LPD = FALSE,
                 maxLPD = 1,
                 indices = FALSE,
-                verbose = TRUE) {
+                verbose = TRUE,
+                algorithm = "brute") {
 
   # handling of different raster formats
   as_stars <- FALSE
@@ -214,7 +216,7 @@ aoa <- function(newdata,
     if (verbose) {
       message("No trainDI provided.")
     }
-    trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, useCV, LPD, verbose)
+    trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, useCV, LPD, verbose, algorithm=algorithm)
   }
 
   if (calc_LPD == TRUE) {
@@ -314,7 +316,7 @@ aoa <- function(newdata,
     }
     mindist <- rep(NA, nrow(newdata))
     mindist[okrows] <-
-      .mindistfun(newdataCC, train_scaled, method, S_inv)
+      .mindistfun(newdataCC, train_scaled, method, S_inv,algorithm=algorithm)
     DI_out <- mindist / trainDI$trainDist_avrgmean
   }
 
@@ -335,13 +337,13 @@ aoa <- function(newdata,
       Indices_out <- matrix(NA, nrow = nrow(newdata), ncol = maxLPD)
     }
     for (i in seq(nrow(newdataCC))) {
-      knnDist  <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD)
+      knnDist  <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD, algorithm=algorithm)
       knnDI <- knnDist / trainDI$trainDist_avrgmean
       knnDI <- c(knnDI)
 
       DI_out[okrows[i]] <- knnDI[1]
       LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold)
-      knnIndex  <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]])
+      knnIndex  <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]],algorithm=algorithm)
 
       if (indices) {
         if (LPD_out[okrows[i]] > 0) {
@@ -449,10 +451,10 @@ aoa <- function(newdata,
             reference,
             method,
             S_inv = NULL,
-            maxLPD = maxLPD) {
+            maxLPD = maxLPD, algorithm) {
     if (method == "L2") {
       # Euclidean Distance
-      return(FNN::knnx.dist(reference, point, k = maxLPD))
+      return(FNN::knnx.dist(reference, point, k = maxLPD, algorithm = algorithm))
     } else if (method == "MD") {
       return(t(sapply(1:dim(point)[1],
                       function(y)
@@ -467,10 +469,10 @@ aoa <- function(newdata,
             reference,
             method,
             S_inv = NULL,
-            maxLPD = maxLPD) {
+            maxLPD = maxLPD, algorithm) {
     if (method == "L2") {
       # Euclidean Distance
-      return(FNN::knnx.index(reference, point, k = maxLPD))
+      return(FNN::knnx.index(reference, point, k = maxLPD, algorithm = algorithm))
     } else if (method == "MD") {
       stop("MD currently not implemented for LPD")
     }

diff --git a/R/geodist.R b/R/geodist.R
@@ -16,6 +16,7 @@
 #' @param variables character vector defining the predictor variables used if type="feature. If not provided all variables included in modeldomain are used.
 #' @param timevar optional. character. Column that indicates the date. Only used if type="time".
 #' @param time_unit optional. Character. Unit for temporal distances See ?difftime.Only used if type="time".
+#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}
 #' @return A data.frame containing the distances. Unit of returned geographic distances is meters. attributes contain W statistic between prediction area and either sample data, CV folds or test data. See details.
 #' @details The modeldomain is a sf polygon or a raster that defines the prediction area. The function takes a regular point sample (amount defined by samplesize) from the spatial extent.
 #'     If type = "feature", the argument modeldomain (and if provided then also the testdata and/or preddata) has to include predictors. Predictor values for x, testdata and preddata are optional if modeldomain is a raster.
@@ -129,7 +130,8 @@ geodist <- function(x,
                     sampling = "regular",
                     variables=NULL,
                     timevar=NULL,
-                    time_unit="auto"){
+                    time_unit="auto",
+                    algorithm="brute"){
 
   # input formatting ------------
   if(is.null(modeldomain)&!is.null(preddata)){
@@ -221,22 +223,22 @@ geodist <- function(x,
   }
 
   # always do sample-to-sample and sample-to-prediction
-  s2s <- sample2sample(x, type,variables,time_unit,timevar, catVars)
-  s2p <- sample2prediction(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars)
+  s2s <- sample2sample(x, type,variables,time_unit,timevar, catVars, algorithm=algorithm)
+  s2p <- sample2prediction(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars, algorithm=algorithm)
 
   dists <- rbind(s2s, s2p)
 
   # optional steps ----
   ##### Distance to test data:
   if(!is.null(testdata)){
-    s2t <- sample2test(x, testdata, type,variables,time_unit,timevar, catVars)
+    s2t <- sample2test(x, testdata, type,variables,time_unit,timevar, catVars, algorithm=algorithm)
     dists <- rbind(dists, s2t)
   }
 
   ##### Distance to CV data:
   if(!is.null(cvfolds)){
 
-    cvd <- cvdistance(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars)
+    cvd <- cvdistance(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars, algorithm=algorithm)
     dists <- rbind(dists, cvd)
   }
   class(dists) <- c("geodist", class(dists))
@@ -270,7 +272,7 @@ geodist <- function(x,
 
 # Sample to Sample Distance
 
-sample2sample <- function(x, type,variables,time_unit,timevar, catVars){
+sample2sample <- function(x, type,variables,time_unit,timevar, catVars, algorithm){
   if(type == "geo"){
     sf::sf_use_s2(TRUE)
     d <- sf::st_distance(x)
@@ -301,7 +303,7 @@ sample2sample <- function(x, type,variables,time_unit,timevar, catVars){
     for (i in 1:nrow(x_clean)){
 
       if(is.null(catVars)) {
-        trainDist <-  FNN::knnx.dist(x_clean[i,],x_clean,k=1)
+        trainDist <-  FNN::knnx.dist(x_clean[i,],x_clean,k=1, algorithm=algorithm)
       } else {
         trainDist <- gower::gower_dist(x_clean[i,],x_clean)
       }
@@ -331,7 +333,7 @@ sample2sample <- function(x, type,variables,time_unit,timevar, catVars){
 
 
 # Sample to Prediction
-sample2prediction = function(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars){
+sample2prediction = function(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars, algorithm){
 
   if(type == "geo"){
     modeldomain <- sf::st_transform(modeldomain, sf::st_crs(x))
@@ -378,7 +380,7 @@ sample2prediction = function(x, modeldomain, type, samplesize,variables,time_uni
     for (i in 1:nrow(modeldomain)){
 
       if(is.null(catVars)) {
-        trainDist <-  FNN::knnx.dist(modeldomain[i,],x_clean,k=1)
+        trainDist <-  FNN::knnx.dist(modeldomain[i,],x_clean,k=1, algorithm=algorithm)
       } else {
         trainDist <- gower::gower_dist(modeldomain[i,], x_clean)
       }
@@ -410,7 +412,7 @@ sample2prediction = function(x, modeldomain, type, samplesize,variables,time_uni
 # sample to test
 
 
-sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){
+sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars, algorithm){
 
   if(type == "geo"){
     testdata <- sf::st_transform(testdata,4326)
@@ -460,7 +462,7 @@ sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){
     for (i in 1:nrow(testdata)){
 
       if(is.null(catVars)) {
-        testDist <- FNN::knnx.dist(testdata[i,],x_clean,k=1)
+        testDist <- FNN::knnx.dist(testdata[i,],x_clean,k=1, algorithm=algorithm)
       } else {
         testDist <- gower::gower_dist(testdata[i,], x_clean)
       }
@@ -491,7 +493,7 @@ sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){
 
 # between folds
 
-cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars){
+cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars, algorithm){
 
   if(!is.null(cvfolds)&!is.list(cvfolds)){ # restructure input if CVtest only contains the fold ID
     tmp <- list()
@@ -550,7 +552,7 @@ cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, c
       for (k in 1:nrow(testdata_i)){
 
         if(is.null(catVars)) {
-          trainDist <-  tryCatch(FNN::knnx.dist(testdata_i[k,],traindata_i,k=1),
+          trainDist <-  tryCatch(FNN::knnx.dist(testdata_i[k,],traindata_i,k=1, algorithm=algorithm),
                                  error = function(e)e)
           if(inherits(trainDist, "error")){
             trainDist <- NA

diff --git a/R/knndm.R b/R/knndm.R
@@ -19,7 +19,7 @@
 #' Only required if modeldomain is used instead of predpoints.
 #' @param useMD boolean. Only for `space`=feature: shall the Mahalanobis distance be calculated instead of Euclidean?
 #' Only works with numerical variables.
-#'
+#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}
 #' @return An object of class \emph{knndm} consisting of a list of eight elements:
 #' indx_train, indx_test (indices of the observations to use as
 #' training/test data in each kNNDM CV iteration), Gij (distances for
@@ -205,7 +205,8 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL,
                   space = "geographical",
                   k = 10, maxp = 0.5,
                   clustering = "hierarchical", linkf = "ward.D2",
-                  samplesize = 1000, sampling = "regular", useMD=FALSE){
+                  samplesize = 1000, sampling = "regular", useMD=FALSE,
+                  algorithm="brute"){
 
   # create sample points from modeldomain
   if(is.null(predpoints)&!is.null(modeldomain)){
@@ -312,14 +313,14 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL,
     # prior checks
     check_knndm_geo(tpoints, predpoints, space, k, maxp, clustering, islonglat)
     # kNNDM in geographical space
-    knndm_res <- knndm_geo(tpoints, predpoints, k, maxp, clustering, linkf, islonglat)
+    knndm_res <- knndm_geo(tpoints, predpoints, k, maxp, clustering, linkf, islonglat, algorithm=algorithm)
 
   } else if (isTRUE(space == "feature")) {
 
     # prior checks
     check_knndm_feature(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars,useMD)
     # kNNDM in feature space
-    knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD)
+    knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD, algorithm=algorithm)
 
   }
 
@@ -379,7 +380,7 @@ check_knndm_feature <- function(tpoints, predpoints, space, k, maxp, clustering,
 
 
 # kNNDM in the geographical space
-knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat){
+knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat, algorithm){
 
   # Gj and Gij calculation
   tcoords <- sf::st_coordinates(tpoints)[,1:2]
@@ -392,9 +393,9 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
     units(Gij) <- NULL
     Gij <- apply(Gij, 1, min)
   }else{
-    Gj <- c(FNN::knn.dist(tcoords, k = 1))
+    Gj <- c(FNN::knn.dist(tcoords, k = 1, algorithm=algorithm))
     Gij <- c(FNN::knnx.dist(query = sf::st_coordinates(predpoints)[,1:2],
-                            data = tcoords, k = 1))
+                            data = tcoords, k = 1, algorithm=algorithm))
   }
 
   # Check if Gj > Gij (warning suppressed regarding ties)
@@ -406,7 +407,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
     if(isTRUE(islonglat)){
       Gjstar <- distclust_distmat(distmat, clust)
     }else{
-      Gjstar <- distclust_euclidean(tcoords, clust)
+      Gjstar <- distclust_euclidean(tcoords, clust, algorithm=algorithm)
     }
     k_final <- "random CV"
     W_final <- twosamples::wass_stat(Gjstar, Gij)
@@ -479,7 +480,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
         if(isTRUE(islonglat)){
           Gjstar_i <- distclust_distmat(distmat, clust_k)
         }else{
-          Gjstar_i <- distclust_euclidean(tcoords, clust_k)
+          Gjstar_i <- distclust_euclidean(tcoords, clust_k,algorithm=algorithm)
         }
         clustgrid$W[clustgrid$nk==nk] <- twosamples::wass_stat(Gjstar_i, Gij)
         clustgroups[[paste0("nk", nk)]] <- clust_k
@@ -493,7 +494,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
     if(isTRUE(islonglat)){
       Gjstar <- distclust_distmat(distmat, clust)
     }else{
-      Gjstar <- distclust_euclidean(tcoords, clust)
+      Gjstar <- distclust_euclidean(tcoords, clust,algorithm=algorithm)
     }
   }
 
@@ -509,7 +510,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
 
 
 # kNNDM in the feature space
-knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD) {
+knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD, algorithm) {
 
   # rescale data
   if(is.null(catVars)) {
@@ -576,8 +577,8 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
 
     } else {
       # use FNN with Euclidean distances if no categorical variables are present
-      Gj <- c(FNN::knn.dist(tpoints, k = 1))
-      Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1))
+      Gj <- c(FNN::knn.dist(tpoints, k = 1, algorithm=algorithm))
+      Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1, algorithm=algorithm))
     }
 
 
@@ -600,7 +601,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
       if(isTRUE(useMD)) {
         Gjstar <- distclust_MD(tpoints, clust)
       } else {
-        Gjstar <- distclust_euclidean(tpoints, clust)
+        Gjstar <- distclust_euclidean(tpoints, clust,algorithm=algorithm)
       }
 
     } else {
@@ -727,7 +728,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
               if(isTRUE(useMD)){
                 Gjstar_i <- distclust_MD(tpoints, clust_k)
               } else {
-                Gjstar_i <- distclust_euclidean(tpoints, clust_k)
+                Gjstar_i <- distclust_euclidean(tpoints, clust_k,algorithm=algorithm)
               }
             } else {
               Gjstar_i <- distclust_gower(tpoints, clust_k)
@@ -755,7 +756,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
         if(isTRUE(useMD)) {
           Gjstar <- distclust_MD(tpoints, clust)
         } else {
-          Gjstar <- distclust_euclidean(tpoints, clust)
+          Gjstar <- distclust_euclidean(tpoints, clust,algorithm=algorithm)
         }
 
       } else {
@@ -789,11 +790,11 @@ distclust_distmat <- function(distm, folds){
 }
 
 # Helper function: Compute out-of-fold NN distance (projected coordinates / numerical variables)
-distclust_euclidean <- function(tr_coords, folds){
+distclust_euclidean <- function(tr_coords, folds, algorithm){
   alldist <- rep(NA, length(folds))
   for(f in unique(folds)){
     alldist[f == folds] <- c(FNN::knnx.dist(query = tr_coords[f == folds,,drop=FALSE],
-                                            data = tr_coords[f != folds,,drop=FALSE], k = 1))
+                                            data = tr_coords[f != folds,,drop=FALSE], k = 1, algorithm=algorithm))
   }
   alldist
 }