From 9b18abb28720b5afea86fe39ba58ed9330d50af3 Mon Sep 17 00:00:00 2001 From: HannaMeyer Date: Thu, 9 Jan 2025 11:40:31 +0100 Subject: [PATCH] no memory leaks --- DESCRIPTION | 7 +++--- NEWS.md | 3 +++ R/aoa.R | 20 +++++++++-------- R/geodist.R | 28 +++++++++++++----------- R/knndm.R | 37 ++++++++++++++++---------------- R/nndm.R | 6 +++--- R/trainDI.R | 19 ++++++++-------- man/aoa.Rd | 5 ++++- man/geodist.Rd | 5 ++++- man/knndm.Rd | 5 ++++- man/nndm.Rd | 5 ++++- man/trainDI.Rd | 5 ++++- vignettes/cast01-CAST-intro.Rmd | 2 +- vignettes/cast02-plotgeodist.Rmd | 4 +++- 14 files changed, 89 insertions(+), 62 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c152437e..26f633f8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: CAST Type: Package Title: 'caret' Applications for Spatial-Temporal Models -Version: 1.0.2 +Version: 1.0.3 Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", role = c("cre", "aut")), person("Carles", "Milà", role = c("aut")), person("Marvin", "Ludwig", role = c("aut")), @@ -10,8 +10,9 @@ Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", rol person("Philipp", "Otto", role = c("ctb")), person("Chris", "Reudenbach", role = c("ctb")), person("Thomas", "Nauss", role = c("ctb")), - person("Edzer", "Pebesma", role = c("ctb"))) -Author: Hanna Meyer [cre, aut], Carles Milà [aut], Marvin Ludwig [aut], Jan Linnenbrink [aut], Fabian Schumacher [aut], Philipp Otto [ctb], Chris Reudenbach [ctb], Thomas Nauss [ctb], Edzer Pebesma [ctb] + person("Edzer", "Pebesma", role = c("ctb")), + person("Jakub", "Nowosad", role = c("ctb"))) +Author: Hanna Meyer [cre, aut], Carles Milà [aut], Marvin Ludwig [aut], Jan Linnenbrink [aut], Fabian Schumacher [aut], Philipp Otto [ctb], Chris Reudenbach [ctb], Thomas Nauss [ctb], Edzer Pebesma [ctb], Jakub Nowosad [ctb] Maintainer: Hanna Meyer Description: Supporting functionality to run 'caret' with spatial or spatial-temporal data. 'caret' is a frequently used package for model training and prediction using machine learning. CAST includes functions to improve spatial or spatial-temporal modelling tasks using 'caret'. It includes the newly suggested 'Nearest neighbor distance matching' cross-validation to estimate the performance of spatial prediction models and allows for spatial variable selection to selects suitable predictor variables in view to their contribution to the spatial model performance. CAST further includes functionality to estimate the (spatial) area of applicability of prediction models. Methods are described in Meyer et al. (2018) ; Meyer et al. (2019) ; Meyer and Pebesma (2021) ; Milà et al. (2022) ; Meyer and Pebesma (2022) ; Linnenbrink et al. (2023) ; Schumacher et al. (2024) . The package is described in detail in Meyer et al. (2024) . License: GPL (>= 2) diff --git a/NEWS.md b/NEWS.md index 7bb3743d..7dd8aff2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# `CAST` 1.0.3 +* bug fix: default algorithm for FNN functions changed + # `CAST` 1.0.2 * bug fix: tests run conditionally diff --git a/R/aoa.R b/R/aoa.R index 1f45c883..9ae92a9d 100644 --- a/R/aoa.R +++ b/R/aoa.R @@ -31,6 +31,7 @@ #' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}}) #' @param indices logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the \href{https://github.com/fab-scm/CASTvis}{CASTvis package} for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues. #' @param verbose Logical. Print progress or not? +#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}} #' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated. #' If variables are factors, dummy variables are created prior to weighting and distance calculation. #' @@ -157,7 +158,8 @@ aoa <- function(newdata, LPD = FALSE, maxLPD = 1, indices = FALSE, - verbose = TRUE) { + verbose = TRUE, + algorithm = "brute") { # handling of different raster formats as_stars <- FALSE @@ -214,7 +216,7 @@ aoa <- function(newdata, if (verbose) { message("No trainDI provided.") } - trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, useCV, LPD, verbose) + trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, useCV, LPD, verbose, algorithm=algorithm) } if (calc_LPD == TRUE) { @@ -314,7 +316,7 @@ aoa <- function(newdata, } mindist <- rep(NA, nrow(newdata)) mindist[okrows] <- - .mindistfun(newdataCC, train_scaled, method, S_inv) + .mindistfun(newdataCC, train_scaled, method, S_inv,algorithm=algorithm) DI_out <- mindist / trainDI$trainDist_avrgmean } @@ -335,13 +337,13 @@ aoa <- function(newdata, Indices_out <- matrix(NA, nrow = nrow(newdata), ncol = maxLPD) } for (i in seq(nrow(newdataCC))) { - knnDist <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD) + knnDist <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD, algorithm=algorithm) knnDI <- knnDist / trainDI$trainDist_avrgmean knnDI <- c(knnDI) DI_out[okrows[i]] <- knnDI[1] LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold) - knnIndex <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]]) + knnIndex <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]],algorithm=algorithm) if (indices) { if (LPD_out[okrows[i]] > 0) { @@ -449,10 +451,10 @@ aoa <- function(newdata, reference, method, S_inv = NULL, - maxLPD = maxLPD) { + maxLPD = maxLPD, algorithm) { if (method == "L2") { # Euclidean Distance - return(FNN::knnx.dist(reference, point, k = maxLPD)) + return(FNN::knnx.dist(reference, point, k = maxLPD, algorithm = algorithm)) } else if (method == "MD") { return(t(sapply(1:dim(point)[1], function(y) @@ -467,10 +469,10 @@ aoa <- function(newdata, reference, method, S_inv = NULL, - maxLPD = maxLPD) { + maxLPD = maxLPD, algorithm) { if (method == "L2") { # Euclidean Distance - return(FNN::knnx.index(reference, point, k = maxLPD)) + return(FNN::knnx.index(reference, point, k = maxLPD, algorithm = algorithm)) } else if (method == "MD") { stop("MD currently not implemented for LPD") } diff --git a/R/geodist.R b/R/geodist.R index dd542fd3..b8b09b47 100644 --- a/R/geodist.R +++ b/R/geodist.R @@ -16,6 +16,7 @@ #' @param variables character vector defining the predictor variables used if type="feature. If not provided all variables included in modeldomain are used. #' @param timevar optional. character. Column that indicates the date. Only used if type="time". #' @param time_unit optional. Character. Unit for temporal distances See ?difftime.Only used if type="time". +#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}} #' @return A data.frame containing the distances. Unit of returned geographic distances is meters. attributes contain W statistic between prediction area and either sample data, CV folds or test data. See details. #' @details The modeldomain is a sf polygon or a raster that defines the prediction area. The function takes a regular point sample (amount defined by samplesize) from the spatial extent. #' If type = "feature", the argument modeldomain (and if provided then also the testdata and/or preddata) has to include predictors. Predictor values for x, testdata and preddata are optional if modeldomain is a raster. @@ -129,7 +130,8 @@ geodist <- function(x, sampling = "regular", variables=NULL, timevar=NULL, - time_unit="auto"){ + time_unit="auto", + algorithm="brute"){ # input formatting ------------ if(is.null(modeldomain)&!is.null(preddata)){ @@ -221,22 +223,22 @@ geodist <- function(x, } # always do sample-to-sample and sample-to-prediction - s2s <- sample2sample(x, type,variables,time_unit,timevar, catVars) - s2p <- sample2prediction(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars) + s2s <- sample2sample(x, type,variables,time_unit,timevar, catVars, algorithm=algorithm) + s2p <- sample2prediction(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars, algorithm=algorithm) dists <- rbind(s2s, s2p) # optional steps ---- ##### Distance to test data: if(!is.null(testdata)){ - s2t <- sample2test(x, testdata, type,variables,time_unit,timevar, catVars) + s2t <- sample2test(x, testdata, type,variables,time_unit,timevar, catVars, algorithm=algorithm) dists <- rbind(dists, s2t) } ##### Distance to CV data: if(!is.null(cvfolds)){ - cvd <- cvdistance(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars) + cvd <- cvdistance(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars, algorithm=algorithm) dists <- rbind(dists, cvd) } class(dists) <- c("geodist", class(dists)) @@ -270,7 +272,7 @@ geodist <- function(x, # Sample to Sample Distance -sample2sample <- function(x, type,variables,time_unit,timevar, catVars){ +sample2sample <- function(x, type,variables,time_unit,timevar, catVars, algorithm){ if(type == "geo"){ sf::sf_use_s2(TRUE) d <- sf::st_distance(x) @@ -301,7 +303,7 @@ sample2sample <- function(x, type,variables,time_unit,timevar, catVars){ for (i in 1:nrow(x_clean)){ if(is.null(catVars)) { - trainDist <- FNN::knnx.dist(x_clean[i,],x_clean,k=1) + trainDist <- FNN::knnx.dist(x_clean[i,],x_clean,k=1, algorithm=algorithm) } else { trainDist <- gower::gower_dist(x_clean[i,],x_clean) } @@ -331,7 +333,7 @@ sample2sample <- function(x, type,variables,time_unit,timevar, catVars){ # Sample to Prediction -sample2prediction = function(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars){ +sample2prediction = function(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars, algorithm){ if(type == "geo"){ modeldomain <- sf::st_transform(modeldomain, sf::st_crs(x)) @@ -378,7 +380,7 @@ sample2prediction = function(x, modeldomain, type, samplesize,variables,time_uni for (i in 1:nrow(modeldomain)){ if(is.null(catVars)) { - trainDist <- FNN::knnx.dist(modeldomain[i,],x_clean,k=1) + trainDist <- FNN::knnx.dist(modeldomain[i,],x_clean,k=1, algorithm=algorithm) } else { trainDist <- gower::gower_dist(modeldomain[i,], x_clean) } @@ -410,7 +412,7 @@ sample2prediction = function(x, modeldomain, type, samplesize,variables,time_uni # sample to test -sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){ +sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars, algorithm){ if(type == "geo"){ testdata <- sf::st_transform(testdata,4326) @@ -460,7 +462,7 @@ sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){ for (i in 1:nrow(testdata)){ if(is.null(catVars)) { - testDist <- FNN::knnx.dist(testdata[i,],x_clean,k=1) + testDist <- FNN::knnx.dist(testdata[i,],x_clean,k=1, algorithm=algorithm) } else { testDist <- gower::gower_dist(testdata[i,], x_clean) } @@ -491,7 +493,7 @@ sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){ # between folds -cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars){ +cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars, algorithm){ if(!is.null(cvfolds)&!is.list(cvfolds)){ # restructure input if CVtest only contains the fold ID tmp <- list() @@ -550,7 +552,7 @@ cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, c for (k in 1:nrow(testdata_i)){ if(is.null(catVars)) { - trainDist <- tryCatch(FNN::knnx.dist(testdata_i[k,],traindata_i,k=1), + trainDist <- tryCatch(FNN::knnx.dist(testdata_i[k,],traindata_i,k=1, algorithm=algorithm), error = function(e)e) if(inherits(trainDist, "error")){ trainDist <- NA diff --git a/R/knndm.R b/R/knndm.R index 0044a106..7861189f 100644 --- a/R/knndm.R +++ b/R/knndm.R @@ -19,7 +19,7 @@ #' Only required if modeldomain is used instead of predpoints. #' @param useMD boolean. Only for `space`=feature: shall the Mahalanobis distance be calculated instead of Euclidean? #' Only works with numerical variables. -#' +#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}} #' @return An object of class \emph{knndm} consisting of a list of eight elements: #' indx_train, indx_test (indices of the observations to use as #' training/test data in each kNNDM CV iteration), Gij (distances for @@ -205,7 +205,8 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL, space = "geographical", k = 10, maxp = 0.5, clustering = "hierarchical", linkf = "ward.D2", - samplesize = 1000, sampling = "regular", useMD=FALSE){ + samplesize = 1000, sampling = "regular", useMD=FALSE, + algorithm="brute"){ # create sample points from modeldomain if(is.null(predpoints)&!is.null(modeldomain)){ @@ -312,14 +313,14 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL, # prior checks check_knndm_geo(tpoints, predpoints, space, k, maxp, clustering, islonglat) # kNNDM in geographical space - knndm_res <- knndm_geo(tpoints, predpoints, k, maxp, clustering, linkf, islonglat) + knndm_res <- knndm_geo(tpoints, predpoints, k, maxp, clustering, linkf, islonglat, algorithm=algorithm) } else if (isTRUE(space == "feature")) { # prior checks check_knndm_feature(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars,useMD) # kNNDM in feature space - knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD) + knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD, algorithm=algorithm) } @@ -379,7 +380,7 @@ check_knndm_feature <- function(tpoints, predpoints, space, k, maxp, clustering, # kNNDM in the geographical space -knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat){ +knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat, algorithm){ # Gj and Gij calculation tcoords <- sf::st_coordinates(tpoints)[,1:2] @@ -392,9 +393,9 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat units(Gij) <- NULL Gij <- apply(Gij, 1, min) }else{ - Gj <- c(FNN::knn.dist(tcoords, k = 1)) + Gj <- c(FNN::knn.dist(tcoords, k = 1, algorithm=algorithm)) Gij <- c(FNN::knnx.dist(query = sf::st_coordinates(predpoints)[,1:2], - data = tcoords, k = 1)) + data = tcoords, k = 1, algorithm=algorithm)) } # Check if Gj > Gij (warning suppressed regarding ties) @@ -406,7 +407,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat if(isTRUE(islonglat)){ Gjstar <- distclust_distmat(distmat, clust) }else{ - Gjstar <- distclust_euclidean(tcoords, clust) + Gjstar <- distclust_euclidean(tcoords, clust, algorithm=algorithm) } k_final <- "random CV" W_final <- twosamples::wass_stat(Gjstar, Gij) @@ -479,7 +480,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat if(isTRUE(islonglat)){ Gjstar_i <- distclust_distmat(distmat, clust_k) }else{ - Gjstar_i <- distclust_euclidean(tcoords, clust_k) + Gjstar_i <- distclust_euclidean(tcoords, clust_k,algorithm=algorithm) } clustgrid$W[clustgrid$nk==nk] <- twosamples::wass_stat(Gjstar_i, Gij) clustgroups[[paste0("nk", nk)]] <- clust_k @@ -493,7 +494,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat if(isTRUE(islonglat)){ Gjstar <- distclust_distmat(distmat, clust) }else{ - Gjstar <- distclust_euclidean(tcoords, clust) + Gjstar <- distclust_euclidean(tcoords, clust,algorithm=algorithm) } } @@ -509,7 +510,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat # kNNDM in the feature space -knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD) { +knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD, algorithm) { # rescale data if(is.null(catVars)) { @@ -576,8 +577,8 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa } else { # use FNN with Euclidean distances if no categorical variables are present - Gj <- c(FNN::knn.dist(tpoints, k = 1)) - Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1)) + Gj <- c(FNN::knn.dist(tpoints, k = 1, algorithm=algorithm)) + Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1, algorithm=algorithm)) } @@ -600,7 +601,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa if(isTRUE(useMD)) { Gjstar <- distclust_MD(tpoints, clust) } else { - Gjstar <- distclust_euclidean(tpoints, clust) + Gjstar <- distclust_euclidean(tpoints, clust,algorithm=algorithm) } } else { @@ -727,7 +728,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa if(isTRUE(useMD)){ Gjstar_i <- distclust_MD(tpoints, clust_k) } else { - Gjstar_i <- distclust_euclidean(tpoints, clust_k) + Gjstar_i <- distclust_euclidean(tpoints, clust_k,algorithm=algorithm) } } else { Gjstar_i <- distclust_gower(tpoints, clust_k) @@ -755,7 +756,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa if(isTRUE(useMD)) { Gjstar <- distclust_MD(tpoints, clust) } else { - Gjstar <- distclust_euclidean(tpoints, clust) + Gjstar <- distclust_euclidean(tpoints, clust,algorithm=algorithm) } } else { @@ -789,11 +790,11 @@ distclust_distmat <- function(distm, folds){ } # Helper function: Compute out-of-fold NN distance (projected coordinates / numerical variables) -distclust_euclidean <- function(tr_coords, folds){ +distclust_euclidean <- function(tr_coords, folds, algorithm){ alldist <- rep(NA, length(folds)) for(f in unique(folds)){ alldist[f == folds] <- c(FNN::knnx.dist(query = tr_coords[f == folds,,drop=FALSE], - data = tr_coords[f != folds,,drop=FALSE], k = 1)) + data = tr_coords[f != folds,,drop=FALSE], k = 1, algorithm=algorithm)) } alldist } diff --git a/R/nndm.R b/R/nndm.R index 881cb143..3f4bdaed 100644 --- a/R/nndm.R +++ b/R/nndm.R @@ -15,7 +15,7 @@ #' Per default (phi="max"), the maximum distance found in the training and prediction points is used. See Details. #' @param min_train Numeric between 0 and 1. Minimum proportion of training #' data that must be used in each CV fold. Defaults to 0.5 (i.e. half of the training points). -#' +#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}} #' @return An object of class \emph{nndm} consisting of a list of six elements: #' indx_train, indx_test, and indx_exclude (indices of the observations to use as #' training/test/excluded data in each NNDM LOO CV iteration), Gij (distances for @@ -178,7 +178,7 @@ nndm <- function(tpoints, modeldomain = NULL, predpoints = NULL, space="geographical", samplesize = 1000, sampling = "regular", - phi = "max", min_train = 0.5){ + phi = "max", min_train = 0.5, algorithm="brute"){ # 1. Preprocessing actions ---- @@ -350,7 +350,7 @@ nndm <- function(tpoints, modeldomain = NULL, predpoints = NULL, if(is.null(catVars)) { # Euclidean distances if no categorical variables are present - Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1)) + Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1, algorithm=algorithm)) tdist <- as.matrix(stats::dist(tpoints, upper = TRUE)) diag(tdist) <- NA Gj <- apply(tdist, 1, function(x) min(x, na.rm=TRUE)) diff --git a/R/trainDI.R b/R/trainDI.R index ff5b381f..304842cb 100644 --- a/R/trainDI.R +++ b/R/trainDI.R @@ -26,7 +26,7 @@ #' @param useCV Logical. Only if a model is given. Use the CV folds to calculate the DI threshold? #' @param LPD Logical. Indicates whether the local point density should be calculated or not. #' @param verbose Logical. Print progress or not? -#' +#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}} #' @seealso \code{\link{aoa}} #' @importFrom graphics boxplot #' @import ggplot2 @@ -104,7 +104,8 @@ trainDI <- function(model = NA, useWeight = TRUE, useCV =TRUE, LPD = FALSE, - verbose = TRUE){ + verbose = TRUE, + algorithm = "brute"){ # get parameters if they are not provided in function call----- if(is.null(train)){train = aoa_get_train(model)} @@ -194,7 +195,7 @@ trainDI <- function(model = NA, #trainDist_avrg <- append(trainDist_avrg, mean(trainDistAll, na.rm = TRUE)) # calculate distance to other training data: - trainDist <- matrix(.alldistfun(t(matrix(train[i,])), train, method, sorted = FALSE, S_inv)) + trainDist <- matrix(.alldistfun(t(matrix(train[i,])), train, method, sorted = FALSE, S_inv,algorithm=algorithm)) trainDist[i] <- NA trainDist_avrg <- append(trainDist_avrg, mean(trainDist, na.rm = TRUE)) @@ -261,7 +262,7 @@ trainDI <- function(model = NA, for (j in seq(nrow(train))) { # calculate distance to other training data: - trainDist <- .alldistfun(t(matrix(train[j,])), train, method, sorted = FALSE, S_inv) + trainDist <- .alldistfun(t(matrix(train[j,])), train, method, sorted = FALSE, S_inv,algorithm=algorithm) DItrainDist <- trainDist/trainDist_avrgmean DItrainDist[j] <- NA @@ -518,10 +519,10 @@ aoa_get_variables <- function(variables, model, train){ -.mindistfun <- function(point, reference, method, S_inv=NULL){ +.mindistfun <- function(point, reference, method, S_inv=NULL,algorithm){ if (method == "L2"){ # Euclidean Distance - return(c(FNN::knnx.dist(reference, point, k = 1))) + return(c(FNN::knnx.dist(reference, point, k = 1, algorithm = algorithm))) } else if (method == "MD"){ # Mahalanobis Distance return(sapply(1:dim(point)[1], function(y) min(sapply(1:dim(reference)[1], @@ -529,13 +530,13 @@ aoa_get_variables <- function(variables, model, train){ } } -.alldistfun <- function(point, reference, method, sorted = TRUE,S_inv=NULL){ +.alldistfun <- function(point, reference, method, sorted = TRUE,S_inv=NULL,algorithm){ if (method == "L2"){ # Euclidean Distance if(sorted){ - return(FNN::knnx.dist(reference, point, k = dim(reference)[1])) + return(FNN::knnx.dist(reference, point, k = dim(reference)[1], algorithm = algorithm)) } else { - return(FNN::knnx.dist(point,reference,k=1)) + return(FNN::knnx.dist(point,reference,k=1, algorithm=algorithm)) } } else if (method == "MD"){ # Mahalanobis Distance if(sorted){ diff --git a/man/aoa.Rd b/man/aoa.Rd index 78408968..52528b5b 100644 --- a/man/aoa.Rd +++ b/man/aoa.Rd @@ -19,7 +19,8 @@ aoa( LPD = FALSE, maxLPD = 1, indices = FALSE, - verbose = TRUE + verbose = TRUE, + algorithm = "brute" ) } \arguments{ @@ -59,6 +60,8 @@ Relevant if some data points are excluded, e.g. when using \code{\link{nndm}}.} \item{indices}{logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the \href{https://github.com/fab-scm/CASTvis}{CASTvis package} for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues.} \item{verbose}{Logical. Print progress or not?} + +\item{algorithm}{see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}} } \value{ An object of class \code{aoa} containing: diff --git a/man/geodist.Rd b/man/geodist.Rd index 58b0242e..57c193fc 100644 --- a/man/geodist.Rd +++ b/man/geodist.Rd @@ -16,7 +16,8 @@ geodist( sampling = "regular", variables = NULL, timevar = NULL, - time_unit = "auto" + time_unit = "auto", + algorithm = "brute" ) } \arguments{ @@ -45,6 +46,8 @@ locations within the modeldomain rather than the whole area.} \item{timevar}{optional. character. Column that indicates the date. Only used if type="time".} \item{time_unit}{optional. Character. Unit for temporal distances See ?difftime.Only used if type="time".} + +\item{algorithm}{see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}} } \value{ A data.frame containing the distances. Unit of returned geographic distances is meters. attributes contain W statistic between prediction area and either sample data, CV folds or test data. See details. diff --git a/man/knndm.Rd b/man/knndm.Rd index 4b5f8a58..5e9d3563 100644 --- a/man/knndm.Rd +++ b/man/knndm.Rd @@ -15,7 +15,8 @@ knndm( linkf = "ward.D2", samplesize = 1000, sampling = "regular", - useMD = FALSE + useMD = FALSE, + algorithm = "brute" ) } \arguments{ @@ -44,6 +45,8 @@ Only required if modeldomain is used instead of predpoints.} \item{useMD}{boolean. Only for `space`=feature: shall the Mahalanobis distance be calculated instead of Euclidean? Only works with numerical variables.} + +\item{algorithm}{see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}} } \value{ An object of class \emph{knndm} consisting of a list of eight elements: diff --git a/man/nndm.Rd b/man/nndm.Rd index 45946fde..10f22db8 100644 --- a/man/nndm.Rd +++ b/man/nndm.Rd @@ -12,7 +12,8 @@ nndm( samplesize = 1000, sampling = "regular", phi = "max", - min_train = 0.5 + min_train = 0.5, + algorithm = "brute" ) } \arguments{ @@ -36,6 +37,8 @@ Per default (phi="max"), the maximum distance found in the training and predicti \item{min_train}{Numeric between 0 and 1. Minimum proportion of training data that must be used in each CV fold. Defaults to 0.5 (i.e. half of the training points).} + +\item{algorithm}{see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}} } \value{ An object of class \emph{nndm} consisting of a list of six elements: diff --git a/man/trainDI.Rd b/man/trainDI.Rd index 662a7ebc..8ce39325 100644 --- a/man/trainDI.Rd +++ b/man/trainDI.Rd @@ -15,7 +15,8 @@ trainDI( useWeight = TRUE, useCV = TRUE, LPD = FALSE, - verbose = TRUE + verbose = TRUE, + algorithm = "brute" ) } \arguments{ @@ -45,6 +46,8 @@ Relevant if some data points are excluded, e.g. when using \code{\link{nndm}}.} \item{LPD}{Logical. Indicates whether the local point density should be calculated or not.} \item{verbose}{Logical. Print progress or not?} + +\item{algorithm}{see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}} } \value{ A list of class \code{trainDI} containing: diff --git a/vignettes/cast01-CAST-intro.Rmd b/vignettes/cast01-CAST-intro.Rmd index f1a09970..cf61885b 100644 --- a/vignettes/cast01-CAST-intro.Rmd +++ b/vignettes/cast01-CAST-intro.Rmd @@ -252,7 +252,7 @@ Still it is required to analyse if the model can be applied to the entire study ```{r, message = FALSE, warning=FALSE} ### AOA for which the spatial CV error applies: -AOA <- aoa(predictors_sp,ffsmodel,LPD = TRUE,verbose=FALSE) +AOA <- aoa(predictors_sp,ffsmodel,LPD = TRUE, verbose=FALSE) # tmap3 tm_shape(prediction)+ diff --git a/vignettes/cast02-plotgeodist.Rmd b/vignettes/cast02-plotgeodist.Rmd index fa4084f0..6835a5e4 100644 --- a/vignettes/cast02-plotgeodist.Rmd +++ b/vignettes/cast02-plotgeodist.Rmd @@ -7,8 +7,10 @@ output: toc: true vignette: > %\VignetteIndexEntry{Visualization of nearest neighbor distance distributions} - %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console --- ```{r setup, include=FALSE}