Skip to content

Commit

Permalink
no memory leaks
Browse files Browse the repository at this point in the history
  • Loading branch information
HannaMeyer committed Jan 9, 2025
1 parent e7bd2fa commit 9b18abb
Show file tree
Hide file tree
Showing 14 changed files with 89 additions and 62 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: CAST
Type: Package
Title: 'caret' Applications for Spatial-Temporal Models
Version: 1.0.2
Version: 1.0.3
Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", role = c("cre", "aut")),
person("Carles", "Milà", role = c("aut")),
person("Marvin", "Ludwig", role = c("aut")),
Expand All @@ -10,8 +10,9 @@ Authors@R: c(person("Hanna", "Meyer", email = "hanna.meyer@uni-muenster.de", rol
person("Philipp", "Otto", role = c("ctb")),
person("Chris", "Reudenbach", role = c("ctb")),
person("Thomas", "Nauss", role = c("ctb")),
person("Edzer", "Pebesma", role = c("ctb")))
Author: Hanna Meyer [cre, aut], Carles Milà [aut], Marvin Ludwig [aut], Jan Linnenbrink [aut], Fabian Schumacher [aut], Philipp Otto [ctb], Chris Reudenbach [ctb], Thomas Nauss [ctb], Edzer Pebesma [ctb]
person("Edzer", "Pebesma", role = c("ctb")),
person("Jakub", "Nowosad", role = c("ctb")))
Author: Hanna Meyer [cre, aut], Carles Milà [aut], Marvin Ludwig [aut], Jan Linnenbrink [aut], Fabian Schumacher [aut], Philipp Otto [ctb], Chris Reudenbach [ctb], Thomas Nauss [ctb], Edzer Pebesma [ctb], Jakub Nowosad [ctb]
Maintainer: Hanna Meyer <hanna.meyer@uni-muenster.de>
Description: Supporting functionality to run 'caret' with spatial or spatial-temporal data. 'caret' is a frequently used package for model training and prediction using machine learning. CAST includes functions to improve spatial or spatial-temporal modelling tasks using 'caret'. It includes the newly suggested 'Nearest neighbor distance matching' cross-validation to estimate the performance of spatial prediction models and allows for spatial variable selection to selects suitable predictor variables in view to their contribution to the spatial model performance. CAST further includes functionality to estimate the (spatial) area of applicability of prediction models. Methods are described in Meyer et al. (2018) <doi:10.1016/j.envsoft.2017.12.001>; Meyer et al. (2019) <doi:10.1016/j.ecolmodel.2019.108815>; Meyer and Pebesma (2021) <doi:10.1111/2041-210X.13650>; Milà et al. (2022) <doi:10.1111/2041-210X.13851>; Meyer and Pebesma (2022) <doi:10.1038/s41467-022-29838-9>; Linnenbrink et al. (2023) <doi:10.5194/egusphere-2023-1308>; Schumacher et al. (2024) <doi:10.5194/egusphere-2024-2730>. The package is described in detail in Meyer et al. (2024) <doi:10.48550/arXiv.2404.06978>.
License: GPL (>= 2)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# `CAST` 1.0.3
* bug fix: default algorithm for FNN functions changed

# `CAST` 1.0.2
* bug fix: tests run conditionally

Expand Down
20 changes: 11 additions & 9 deletions R/aoa.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#' @param maxLPD numeric or integer. Only if \code{LPD = TRUE}. Number of nearest neighbors to be considered for the calculation of the LPD. Either define a number between 0 and 1 to use a percentage of the number of training samples for the LPD calculation or a whole number larger than 1 and smaller than the number of training samples. CAUTION! If not all training samples are considered, a fitted relationship between LPD and error metric will not make sense (@seealso \code{\link{DItoErrormetric}})
#' @param indices logical. Calculate indices of the training data points that are responsible for the LPD of a new prediction location? Output is a matrix with the dimensions num(raster_cells) x maxLPD. Each row holds the indices of the training data points that are relevant for the specific LPD value at that location. Can be used in combination with exploreAOA(aoa) function from the \href{https://github.com/fab-scm/CASTvis}{CASTvis package} for a better visual interpretation of the results. Note that the matrix can be quite big for examples with a high resolution and a larger number of training samples, which can cause memory issues.
#' @param verbose Logical. Print progress or not?
#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}
#' @details The Dissimilarity Index (DI), the Local Data Point Density (LPD) and the corresponding Area of Applicability (AOA) are calculated.
#' If variables are factors, dummy variables are created prior to weighting and distance calculation.
#'
Expand Down Expand Up @@ -157,7 +158,8 @@ aoa <- function(newdata,
LPD = FALSE,
maxLPD = 1,
indices = FALSE,
verbose = TRUE) {
verbose = TRUE,
algorithm = "brute") {

# handling of different raster formats
as_stars <- FALSE
Expand Down Expand Up @@ -214,7 +216,7 @@ aoa <- function(newdata,
if (verbose) {
message("No trainDI provided.")
}
trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, useCV, LPD, verbose)
trainDI <- trainDI(model, train, variables, weight, CVtest, CVtrain, method, useWeight, useCV, LPD, verbose, algorithm=algorithm)
}

if (calc_LPD == TRUE) {
Expand Down Expand Up @@ -314,7 +316,7 @@ aoa <- function(newdata,
}
mindist <- rep(NA, nrow(newdata))
mindist[okrows] <-
.mindistfun(newdataCC, train_scaled, method, S_inv)
.mindistfun(newdataCC, train_scaled, method, S_inv,algorithm=algorithm)
DI_out <- mindist / trainDI$trainDist_avrgmean
}

Expand All @@ -335,13 +337,13 @@ aoa <- function(newdata,
Indices_out <- matrix(NA, nrow = nrow(newdata), ncol = maxLPD)
}
for (i in seq(nrow(newdataCC))) {
knnDist <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD)
knnDist <- .knndistfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = maxLPD, algorithm=algorithm)
knnDI <- knnDist / trainDI$trainDist_avrgmean
knnDI <- c(knnDI)

DI_out[okrows[i]] <- knnDI[1]
LPD_out[okrows[i]] <- sum(knnDI < trainDI$threshold)
knnIndex <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]])
knnIndex <- .knnindexfun(t(matrix(newdataCC[i,])), train_scaled, method, S_inv, maxLPD = LPD_out[okrows[i]],algorithm=algorithm)

if (indices) {
if (LPD_out[okrows[i]] > 0) {
Expand Down Expand Up @@ -449,10 +451,10 @@ aoa <- function(newdata,
reference,
method,
S_inv = NULL,
maxLPD = maxLPD) {
maxLPD = maxLPD, algorithm) {
if (method == "L2") {
# Euclidean Distance
return(FNN::knnx.dist(reference, point, k = maxLPD))
return(FNN::knnx.dist(reference, point, k = maxLPD, algorithm = algorithm))
} else if (method == "MD") {
return(t(sapply(1:dim(point)[1],
function(y)
Expand All @@ -467,10 +469,10 @@ aoa <- function(newdata,
reference,
method,
S_inv = NULL,
maxLPD = maxLPD) {
maxLPD = maxLPD, algorithm) {
if (method == "L2") {
# Euclidean Distance
return(FNN::knnx.index(reference, point, k = maxLPD))
return(FNN::knnx.index(reference, point, k = maxLPD, algorithm = algorithm))
} else if (method == "MD") {
stop("MD currently not implemented for LPD")
}
Expand Down
28 changes: 15 additions & 13 deletions R/geodist.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#' @param variables character vector defining the predictor variables used if type="feature. If not provided all variables included in modeldomain are used.
#' @param timevar optional. character. Column that indicates the date. Only used if type="time".
#' @param time_unit optional. Character. Unit for temporal distances See ?difftime.Only used if type="time".
#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}
#' @return A data.frame containing the distances. Unit of returned geographic distances is meters. attributes contain W statistic between prediction area and either sample data, CV folds or test data. See details.
#' @details The modeldomain is a sf polygon or a raster that defines the prediction area. The function takes a regular point sample (amount defined by samplesize) from the spatial extent.
#' If type = "feature", the argument modeldomain (and if provided then also the testdata and/or preddata) has to include predictors. Predictor values for x, testdata and preddata are optional if modeldomain is a raster.
Expand Down Expand Up @@ -129,7 +130,8 @@ geodist <- function(x,
sampling = "regular",
variables=NULL,
timevar=NULL,
time_unit="auto"){
time_unit="auto",
algorithm="brute"){

# input formatting ------------
if(is.null(modeldomain)&!is.null(preddata)){
Expand Down Expand Up @@ -221,22 +223,22 @@ geodist <- function(x,
}

# always do sample-to-sample and sample-to-prediction
s2s <- sample2sample(x, type,variables,time_unit,timevar, catVars)
s2p <- sample2prediction(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars)
s2s <- sample2sample(x, type,variables,time_unit,timevar, catVars, algorithm=algorithm)
s2p <- sample2prediction(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars, algorithm=algorithm)

dists <- rbind(s2s, s2p)

# optional steps ----
##### Distance to test data:
if(!is.null(testdata)){
s2t <- sample2test(x, testdata, type,variables,time_unit,timevar, catVars)
s2t <- sample2test(x, testdata, type,variables,time_unit,timevar, catVars, algorithm=algorithm)
dists <- rbind(dists, s2t)
}

##### Distance to CV data:
if(!is.null(cvfolds)){

cvd <- cvdistance(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars)
cvd <- cvdistance(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars, algorithm=algorithm)
dists <- rbind(dists, cvd)
}
class(dists) <- c("geodist", class(dists))
Expand Down Expand Up @@ -270,7 +272,7 @@ geodist <- function(x,

# Sample to Sample Distance

sample2sample <- function(x, type,variables,time_unit,timevar, catVars){
sample2sample <- function(x, type,variables,time_unit,timevar, catVars, algorithm){
if(type == "geo"){
sf::sf_use_s2(TRUE)
d <- sf::st_distance(x)
Expand Down Expand Up @@ -301,7 +303,7 @@ sample2sample <- function(x, type,variables,time_unit,timevar, catVars){
for (i in 1:nrow(x_clean)){

if(is.null(catVars)) {
trainDist <- FNN::knnx.dist(x_clean[i,],x_clean,k=1)
trainDist <- FNN::knnx.dist(x_clean[i,],x_clean,k=1, algorithm=algorithm)
} else {
trainDist <- gower::gower_dist(x_clean[i,],x_clean)
}
Expand Down Expand Up @@ -331,7 +333,7 @@ sample2sample <- function(x, type,variables,time_unit,timevar, catVars){


# Sample to Prediction
sample2prediction = function(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars){
sample2prediction = function(x, modeldomain, type, samplesize,variables,time_unit,timevar, catVars, algorithm){

if(type == "geo"){
modeldomain <- sf::st_transform(modeldomain, sf::st_crs(x))
Expand Down Expand Up @@ -378,7 +380,7 @@ sample2prediction = function(x, modeldomain, type, samplesize,variables,time_uni
for (i in 1:nrow(modeldomain)){

if(is.null(catVars)) {
trainDist <- FNN::knnx.dist(modeldomain[i,],x_clean,k=1)
trainDist <- FNN::knnx.dist(modeldomain[i,],x_clean,k=1, algorithm=algorithm)
} else {
trainDist <- gower::gower_dist(modeldomain[i,], x_clean)
}
Expand Down Expand Up @@ -410,7 +412,7 @@ sample2prediction = function(x, modeldomain, type, samplesize,variables,time_uni
# sample to test


sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){
sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars, algorithm){

if(type == "geo"){
testdata <- sf::st_transform(testdata,4326)
Expand Down Expand Up @@ -460,7 +462,7 @@ sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){
for (i in 1:nrow(testdata)){

if(is.null(catVars)) {
testDist <- FNN::knnx.dist(testdata[i,],x_clean,k=1)
testDist <- FNN::knnx.dist(testdata[i,],x_clean,k=1, algorithm=algorithm)
} else {
testDist <- gower::gower_dist(testdata[i,], x_clean)
}
Expand Down Expand Up @@ -491,7 +493,7 @@ sample2test <- function(x, testdata, type,variables,time_unit,timevar, catVars){

# between folds

cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars){
cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, catVars, algorithm){

if(!is.null(cvfolds)&!is.list(cvfolds)){ # restructure input if CVtest only contains the fold ID
tmp <- list()
Expand Down Expand Up @@ -550,7 +552,7 @@ cvdistance <- function(x, cvfolds, cvtrain, type, variables,time_unit,timevar, c
for (k in 1:nrow(testdata_i)){

if(is.null(catVars)) {
trainDist <- tryCatch(FNN::knnx.dist(testdata_i[k,],traindata_i,k=1),
trainDist <- tryCatch(FNN::knnx.dist(testdata_i[k,],traindata_i,k=1, algorithm=algorithm),
error = function(e)e)
if(inherits(trainDist, "error")){
trainDist <- NA
Expand Down
37 changes: 19 additions & 18 deletions R/knndm.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#' Only required if modeldomain is used instead of predpoints.
#' @param useMD boolean. Only for `space`=feature: shall the Mahalanobis distance be calculated instead of Euclidean?
#' Only works with numerical variables.
#'
#' @param algorithm see \code{\link[FNN]{knnx.dist}} and \code{\link[FNN]{knnx.index}}
#' @return An object of class \emph{knndm} consisting of a list of eight elements:
#' indx_train, indx_test (indices of the observations to use as
#' training/test data in each kNNDM CV iteration), Gij (distances for
Expand Down Expand Up @@ -205,7 +205,8 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL,
space = "geographical",
k = 10, maxp = 0.5,
clustering = "hierarchical", linkf = "ward.D2",
samplesize = 1000, sampling = "regular", useMD=FALSE){
samplesize = 1000, sampling = "regular", useMD=FALSE,
algorithm="brute"){

# create sample points from modeldomain
if(is.null(predpoints)&!is.null(modeldomain)){
Expand Down Expand Up @@ -312,14 +313,14 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL,
# prior checks
check_knndm_geo(tpoints, predpoints, space, k, maxp, clustering, islonglat)
# kNNDM in geographical space
knndm_res <- knndm_geo(tpoints, predpoints, k, maxp, clustering, linkf, islonglat)
knndm_res <- knndm_geo(tpoints, predpoints, k, maxp, clustering, linkf, islonglat, algorithm=algorithm)

} else if (isTRUE(space == "feature")) {

# prior checks
check_knndm_feature(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars,useMD)
# kNNDM in feature space
knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD)
knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD, algorithm=algorithm)

}

Expand Down Expand Up @@ -379,7 +380,7 @@ check_knndm_feature <- function(tpoints, predpoints, space, k, maxp, clustering,


# kNNDM in the geographical space
knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat){
knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat, algorithm){

# Gj and Gij calculation
tcoords <- sf::st_coordinates(tpoints)[,1:2]
Expand All @@ -392,9 +393,9 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
units(Gij) <- NULL
Gij <- apply(Gij, 1, min)
}else{
Gj <- c(FNN::knn.dist(tcoords, k = 1))
Gj <- c(FNN::knn.dist(tcoords, k = 1, algorithm=algorithm))
Gij <- c(FNN::knnx.dist(query = sf::st_coordinates(predpoints)[,1:2],
data = tcoords, k = 1))
data = tcoords, k = 1, algorithm=algorithm))
}

# Check if Gj > Gij (warning suppressed regarding ties)
Expand All @@ -406,7 +407,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
if(isTRUE(islonglat)){
Gjstar <- distclust_distmat(distmat, clust)
}else{
Gjstar <- distclust_euclidean(tcoords, clust)
Gjstar <- distclust_euclidean(tcoords, clust, algorithm=algorithm)
}
k_final <- "random CV"
W_final <- twosamples::wass_stat(Gjstar, Gij)
Expand Down Expand Up @@ -479,7 +480,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
if(isTRUE(islonglat)){
Gjstar_i <- distclust_distmat(distmat, clust_k)
}else{
Gjstar_i <- distclust_euclidean(tcoords, clust_k)
Gjstar_i <- distclust_euclidean(tcoords, clust_k,algorithm=algorithm)
}
clustgrid$W[clustgrid$nk==nk] <- twosamples::wass_stat(Gjstar_i, Gij)
clustgroups[[paste0("nk", nk)]] <- clust_k
Expand All @@ -493,7 +494,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat
if(isTRUE(islonglat)){
Gjstar <- distclust_distmat(distmat, clust)
}else{
Gjstar <- distclust_euclidean(tcoords, clust)
Gjstar <- distclust_euclidean(tcoords, clust,algorithm=algorithm)
}
}

Expand All @@ -509,7 +510,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat


# kNNDM in the feature space
knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD) {
knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD, algorithm) {

# rescale data
if(is.null(catVars)) {
Expand Down Expand Up @@ -576,8 +577,8 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa

} else {
# use FNN with Euclidean distances if no categorical variables are present
Gj <- c(FNN::knn.dist(tpoints, k = 1))
Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1))
Gj <- c(FNN::knn.dist(tpoints, k = 1, algorithm=algorithm))
Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1, algorithm=algorithm))
}


Expand All @@ -600,7 +601,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
if(isTRUE(useMD)) {
Gjstar <- distclust_MD(tpoints, clust)
} else {
Gjstar <- distclust_euclidean(tpoints, clust)
Gjstar <- distclust_euclidean(tpoints, clust,algorithm=algorithm)
}

} else {
Expand Down Expand Up @@ -727,7 +728,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
if(isTRUE(useMD)){
Gjstar_i <- distclust_MD(tpoints, clust_k)
} else {
Gjstar_i <- distclust_euclidean(tpoints, clust_k)
Gjstar_i <- distclust_euclidean(tpoints, clust_k,algorithm=algorithm)
}
} else {
Gjstar_i <- distclust_gower(tpoints, clust_k)
Expand Down Expand Up @@ -755,7 +756,7 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
if(isTRUE(useMD)) {
Gjstar <- distclust_MD(tpoints, clust)
} else {
Gjstar <- distclust_euclidean(tpoints, clust)
Gjstar <- distclust_euclidean(tpoints, clust,algorithm=algorithm)
}

} else {
Expand Down Expand Up @@ -789,11 +790,11 @@ distclust_distmat <- function(distm, folds){
}

# Helper function: Compute out-of-fold NN distance (projected coordinates / numerical variables)
distclust_euclidean <- function(tr_coords, folds){
distclust_euclidean <- function(tr_coords, folds, algorithm){
alldist <- rep(NA, length(folds))
for(f in unique(folds)){
alldist[f == folds] <- c(FNN::knnx.dist(query = tr_coords[f == folds,,drop=FALSE],
data = tr_coords[f != folds,,drop=FALSE], k = 1))
data = tr_coords[f != folds,,drop=FALSE], k = 1, algorithm=algorithm))
}
alldist
}
Expand Down
Loading

0 comments on commit 9b18abb

Please sign in to comment.