Dev #12

Merged
41 commits merged on Dec 8, 2023
Commits (41)
f86dbad
update plot and print
fab-scm Oct 3, 2023
6b075f4
add exploreAOA to package
fab-scm Oct 4, 2023
accebdb
add message for trainLPD calculation
fab-scm Oct 4, 2023
50c8387
update explore AOA docs
fab-scm Oct 4, 2023
1f0a870
shift package to description
fab-scm Oct 4, 2023
ddc7473
update exploreAOA docs
fab-scm Oct 5, 2023
4858169
update exploreAOA docs and add xmark
fab-scm Oct 5, 2023
3ce69a0
update aoa, trainDI and exploreAOA docs
fab-scm Oct 5, 2023
60e3149
update aoa docs
fab-scm Oct 5, 2023
802a17f
change order of print.aoa
fab-scm Oct 5, 2023
43777ca
update aoa.Rd
fab-scm Oct 5, 2023
62dc013
update DESCRIPTION
fab-scm Oct 5, 2023
c30c2fd
Merge branch 'master' into dev
fab-scm Oct 5, 2023
720f7c1
update aoa & trainDI
fab-scm Oct 17, 2023
33032dd
add trainDat upload options
fab-scm Oct 30, 2023
b6175e2
raster to terra and new layout (#5)
fab-scm Nov 2, 2023
3c384db
update plot function
fab-scm Nov 8, 2023
f1143d5
update exploreAOA
fab-scm Nov 8, 2023
1f228dd
update exploreAOA
fab-scm Nov 8, 2023
aff07d8
fix error
fab-scm Nov 8, 2023
abba3c9
fix error
fab-scm Nov 8, 2023
e463d54
fix crs bounding box issue
fab-scm Nov 8, 2023
7e57eb9
rm commented code
fab-scm Nov 8, 2023
0363233
update plot.aoa & exploreAOA
fab-scm Nov 9, 2023
28e5628
fix error in trainLPD calculation
fab-scm Nov 13, 2023
0668d40
fix MD calculation
fab-scm Nov 13, 2023
2971b93
fix order in print.aoa & message in aoa()
fab-scm Nov 14, 2023
eb89f70
add predictors tif to CAST package
fab-scm Nov 14, 2023
ed09047
add countryboundaries
fab-scm Nov 15, 2023
817f938
add elevation
fab-scm Nov 16, 2023
c106491
Merge branch 'master' into dev
fab-scm Nov 20, 2023
8d85c9f
Change `maxLPD` parameter handling & enable `trainLPD` calculation wi…
fab-scm Nov 20, 2023
64b0d42
Add `LPDtoErrormetric` functionality (#8)
fab-scm Nov 20, 2023
1402829
minor changes on docs and comments
fab-scm Nov 20, 2023
012e617
major docs update
fab-scm Nov 20, 2023
6cc7dfa
update DItoErrormetric and LPDtoErrormetric
fab-scm Nov 20, 2023
53aa2c9
New calculation methods for LPD of training data and LPD of predictio…
fab-scm Dec 8, 2023
d7269f3
Merge branch 'HannaMeyer:master' into dev
fab-scm Dec 8, 2023
f2f5c16
Feat train di new (#11)
fab-scm Dec 8, 2023
b18bd01
fix error in LPDtoErrormetric
fab-scm Dec 8, 2023
3fa269d
Merge branch 'master' into dev
fab-scm Dec 8, 2023
6 changes: 5 additions & 1 deletion NAMESPACE
@@ -1,7 +1,9 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,aoa)
-S3method(plot,errorModel)
+S3method(plot,errorModelDI)
+S3method(plot,errorModelDI_LPD)
+S3method(plot,errorModelLPD)
S3method(plot,ffs)
S3method(plot,geodist)
S3method(plot,knndm)
@@ -13,7 +15,9 @@ S3method(print,knndm)
S3method(print,nndm)
S3method(print,trainDI)
export(CreateSpacetimeFolds)
+export(DI_LPDtoErrormetric)
export(DItoErrormetric)
+export(LPDtoErrormetric)
export(aoa)
export(bss)
export(calibrate_aoa)
264 changes: 264 additions & 0 deletions R/DI_LPDtoErrormetric.R
@@ -0,0 +1,264 @@
#' Model the relationship between the DI and LPD and the prediction error
#' @description Performance metrics are calculated for moving windows of DI values of cross-validated training data
#' @param model the model used to get the AOA
#' @param trainDI the result of \code{\link{trainDI}} or an aoa object from \code{\link{aoa}}
#' @param multiCV Logical. Re-run model fitting and validation with different CV strategies. See details.
#' @param window.size Numeric. Size of the moving window. See \code{\link{rollapply}}.
#' @param calib Character. Function to model the DI+LPD~performance relationship. Currently lm, scam and exp are supported
#' @param length.out Numeric. Only used if multiCV=TRUE. Number of cross-validation runs with increasing cluster counts. See details.
#' @param method Character. Method used for distance calculation. Currently euclidean distance (L2) and Mahalanobis distance (MD) are implemented but only L2 is tested. Note that MD takes considerably longer. See ?aoa for further explanation
#' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
#' @param kDI Numeric. Basis dimension of the DI smooth term. See mgcv::s
#' @param mDI Numeric. Penalty order of the DI smooth term. See mgcv::s
#' @param kLPD Numeric. Basis dimension of the LPD smooth term. See mgcv::s
#' @param mLPD Numeric. Penalty order of the LPD smooth term. See mgcv::s
#' @details If multiCV=TRUE the model is re-fitted and validated by length.out new cross-validations where the cross-validation folds are defined by clusters in the predictor space,
#' ranging from three clusters to LOOCV. Hence, a large range of DI values is created during cross-validation.
#' If the AOA threshold based on the calibration data from multiple CV is larger than the original AOA threshold (which is likely if extrapolation situations are created during CV),
#' the AOA threshold changes accordingly. See Meyer and Pebesma (2021) for the full documentation of the methodology.
#' @return A scam or linear model
#' @author
#' Hanna Meyer, Marvin Ludwig
#' @references Meyer, H., Pebesma, E. (2021): Predicting into unknown space?
#' Estimating the area of applicability of spatial prediction models.
#' \doi{10.1111/2041-210X.13650}
#' @seealso \code{\link{aoa}}
#' @example inst/examples/ex_DI_LPDtoErrormetric.R
#'
#'
#' @export
#'

DI_LPDtoErrormetric <- function(model, trainDI, multiCV=FALSE,
length.out = 10, window.size = 5, calib = "scam",
method= "L2", useWeight=TRUE,
kDI = 6, mDI = 2, kLPD = 4, mLPD = 2){


if(inherits(trainDI,"aoa")){
trainDI = trainDI$parameters
}


# get DIs and Errormetrics OR calculate new ones from multiCV
if(!multiCV){
preds_all <- get_preds_all_DI_LPD(model, trainDI)
}
if(multiCV){
preds_all <- multiCV_DI_LPD(model, length.out, method, useWeight)
}

# train model between DI and Errormetric
error_model = errorModel_DI_LPD(preds_all, model, window.size, calib, kDI, mDI, kLPD, mLPD)

# save AOA threshold and raw data
attr(error_model, "AOA_threshold") <- attr(preds_all, "AOA_threshold")
class(error_model) <- c("errorModelDI_LPD", class(error_model))
return(error_model)
}
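# A minimal usage sketch (hypothetical object names; assumes a caret model
# trained with savePredictions = TRUE and an AOA computed with LPD = TRUE so
# that both DI and LPD are available):
#   model  <- caret::train(predictors_df, response,
#                          trControl = caret::trainControl(method = "cv",
#                                                          savePredictions = TRUE))
#   AOA    <- aoa(predictor_stack, model, LPD = TRUE)
#   errmod <- DI_LPDtoErrormetric(model, AOA)
#   # expected error map; layer names must match the model terms DI and LPD:
#   expected_error <- terra::predict(c(AOA$DI, AOA$LPD), errmod)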






#' Model the expected error from DI and LPD
#' @param preds_all data.frame: pred, obs, DI, LPD
#' @param model the model used to get the AOA
#' @param window.size Numeric. Size of the moving window. See \code{\link{rollapply}}.
#' @param calib Character. Function to model the DI+LPD~performance relationship. Currently lm, scam and exp are supported
#' @param kDI Numeric. See mgcv::s
#' @param mDI Numeric. See mgcv::s
#' @param kLPD Numeric. See mgcv::s
#' @param mLPD Numeric. See mgcv::s
#' @return scam or lm
#'

errorModel_DI_LPD <- function(preds_all, model, window.size, calib, kDI, mDI, kLPD, mLPD){

## use performance metric from the model:
rmse <- function(pred,obs){sqrt( mean((pred - obs)^2, na.rm = TRUE) )}
rsquared <- function(pred,obs){summary(lm(pred~obs))$r.squared}
mae <- function(pred,obs){MAE(pred,obs)}
kappa <- function(pred,obs){
pred <- factor(pred)
obs <- factor(obs)
lev <- unique(c(levels(pred), levels(obs)))
pred <- factor(pred, levels = lev)
obs <- factor(obs, levels = lev)
result <- tryCatch( confusionMatrix(pred, obs)$overall["Kappa"], error = function(e)e)
if(inherits(result, "error")){result <- 0} # TODO: 0 is a placeholder, not a meaningful Kappa; adjust
return(unname(result))
}

accuracy <- function(pred,obs){
pred <- factor(pred)
obs <- factor(obs)
lev <- unique(c(levels(pred), levels(obs)))
pred <- factor(pred, levels = lev)
obs <- factor(obs, levels = lev)
result <- tryCatch(confusionMatrix(pred, obs)$overall["Accuracy"], error = function(e)e)
if(inherits(result, "error")){result <- 0}
return(unname(result))
}
if(!tolower(model$metric)%in%c("rmse","rsquared","mae","kappa","accuracy")){
stop("Model metric not yet included in this function")
}

evalfunc <- function(pred,obs){
eval(parse(text=paste0(tolower(model$metric),"(pred,obs)")))
}

# performance$metric <- abs(performance$pred - performance$obs)
# order data according to DI:
performanceDI <- preds_all[order(preds_all$DI),]
# calculate performance for moving window:
performanceDI$metric <- zoo::rollapply(performanceDI[,1:2], window.size,
FUN=function(x){evalfunc(x[,1],x[,2])},
by.column=F,align = "center",fill=NA)
performanceDI$ll <- data.table::shift(performanceDI$DI,window.size/2)
performanceDI$ul <- data.table::shift(performanceDI$DI,-round(window.size/2),0)
performanceDI <- performanceDI[!is.na(performanceDI$metric),]
performanceDI$ID <- as.numeric(rownames(performanceDI))

# order data according to LPD:
performanceLPD <- preds_all[order(preds_all$LPD),]
# calculate performance for moving window:
performanceLPD$metric <- zoo::rollapply(performanceLPD[,1:2], window.size,
FUN=function(x){evalfunc(x[,1],x[,2])},
by.column=F,align = "center",fill=NA)
performanceLPD$ll <- data.table::shift(performanceLPD$LPD,window.size/2)
performanceLPD$ul <- data.table::shift(performanceLPD$LPD,-round(window.size/2),0)
performanceLPD <- performanceLPD[!is.na(performanceLPD$metric),]
performanceLPD$ID <- as.numeric(rownames(performanceLPD))

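# combine the two window orderings: merge on the shared columns and average
# the DI-based and LPD-based moving-window metrics into a single response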
performance <- merge(performanceDI, performanceLPD, by = c("ID", "DI", "LPD", "pred", "obs"))
performance$metric <- (performance$metric.x + performance$metric.y) / 2
names(performance) <- c("ID", "DI", "LPD", "pred", "obs", "metric.DI", "ll.DI", "ul.DI", "metric.LPD", "ll.LPD", "ul.LPD", "metric")

### Estimate Error:
if(calib=="lm"){
errormodel <- lm(metric ~ DI+ LPD, data = performance)
}
if(calib=="scam"){
if (!requireNamespace("scam", quietly = TRUE)) {
stop("Package \"scam\" needed for this function to work. Please install it.",
call. = FALSE)
}
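# monotone shape constraints for the scam smooths: for metrics that are
# maximized (accuracy, kappa, Rsquared) performance is assumed to decrease
# with increasing DI ("mpd") and to increase with increasing LPD ("mpi");
# for minimized metrics (RMSE, MAE) the constraints are reversed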
if (model$maximize){ # e.g. accuracy, kappa, r2
bsDI="mpd"
}else{
bsDI="mpi" #e.g. RMSE
}

if (model$maximize){ # e.g. accuracy, kappa, r2
bsLPD="mpi"
}else{
bsLPD="mpd" #e.g. RMSE
}

errormodel <- scam::scam(metric~s(DI, k=kDI, bs=bsDI, m=mDI)+s(LPD, k=kLPD, bs=bsLPD, m=mLPD),
data=performance,
family=stats::gaussian(link="identity"))
}
if(calib=="exp"){
errormodel <- lm(metric ~ DI + log(LPD), data = performance)
}

attr(errormodel, "performance") = performance

return(errormodel)
}
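# A toy illustration of the moving-window metric computed above (assumed
# values, RMSE as the metric):
#   df <- data.frame(pred = rnorm(20), obs = rnorm(20))
#   zoo::rollapply(df, width = 5,
#                  FUN = function(x) sqrt(mean((x[, 1] - x[, 2])^2)),
#                  by.column = FALSE, align = "center", fill = NA)
# each element of the result is the RMSE of a 5-observation window; ll and ul
# then record the DI (or LPD) range the window covers.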


#' MultiCV
#' @description
#' Multiple cross-validation with an increasing number of feature-space clusters
#' @param model the model used to get the AOA
#' @param length.out Numeric. Only used if multiCV=TRUE. Number of cross-validation runs with increasing cluster counts. See details.
#' @param method Character. Method used for distance calculation. Currently euclidean distance (L2) and Mahalanobis distance (MD) are implemented but only L2 is tested. Note that MD takes considerably longer. See ?aoa for further explanation
#' @param useWeight Logical. Only if a model is given. Weight variables according to importance in the model?
#' @param ... additional parameters to trainDI
#' @returns preds_all
#'
#'

multiCV_DI_LPD <- function(model, length.out, method, useWeight,...){

preds_all <- data.frame()
train_predictors <- model$trainingData[,-which(names(model$trainingData)==".outcome")]
train_response <- model$trainingData$.outcome

for (nclst in round(seq(3,nrow(train_predictors), length.out = length.out))){
# define clusters in predictor space used for CV:
clstrID <- tryCatch({stats::kmeans(train_predictors,nclst)$cluster},
error=function(e)e)
if(inherits(clstrID,"error")){next}
folds <- CreateSpacetimeFolds(data.frame("clstrID"=clstrID), spacevar="clstrID",k=nclst)

# update model call with new CV strategy:
mcall <- as.list(model$call)
mcall <- mcall[-which(names(mcall)%in%c("form","data","","x","y","","trControl"))]
mcall$x <- quote(train_predictors)
mcall$y <- quote(train_response)
mcall$trControl <- trainControl(method="cv",index=folds$index,savePredictions = TRUE)
mcall$tuneGrid <- model$bestTune
mcall$method <- model$method
mcall$metric <- model$metric
mcall$cl <- NULL # fix option for parallel later

# retrain model and calculate AOA
model_new <- do.call(caret::train,mcall)
trainDI_new <- trainDI(model_new, method=method, useWeight=useWeight, LPD = TRUE)


# get cross-validated predictions, order them and use only those located in the AOA
preds <- model_new$pred
preds <- preds[order(preds$rowIndex),c("pred","obs")]
preds_dat_tmp <- data.frame(preds,"DI"=trainDI_new$trainDI, "LPD"=trainDI_new$trainLPD)
preds_dat_tmp <- preds_dat_tmp[preds_dat_tmp$DI <= trainDI_new$threshold,]
preds_all <- rbind(preds_all,preds_dat_tmp)
}

attr(preds_all, "AOA_threshold") <- trainDI_new$threshold
attr(preds_all, "avrgLPD") <- trainDI_new$avrgLPD
message(paste0("Note: multiCV=TRUE calculated new AOA threshold of ", round(trainDI_new$threshold, 5), " and new average LPD of ", trainDI_new$avrgLPD,
"\nThreshold is stored in the attributes, access with attr(error_model, 'AOA_threshold').",
"\nAverage LPD is stored in the attributes, access with attr(error_model, 'avrgLPD').",
"\nPlease refer to examples and details for further information."))
return(preds_all)

}
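# Sketch of the clustered CV used above (hypothetical toy call): k-means
# clusters in predictor space define the folds, so each held-out fold is a
# region of the feature space the model has not seen, which forces
# extrapolation and produces larger DI values in the CV predictions:
#   clst  <- stats::kmeans(train_predictors, centers = 5)$cluster
#   folds <- CreateSpacetimeFolds(data.frame(clstrID = clst),
#                                 spacevar = "clstrID", k = 5)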


#' Get all cross-validated predictions
#' @param model a trained caret model
#' @param trainDI a trainDI object
#'

get_preds_all_DI_LPD <- function(model, trainDI){

if(is.null(model$pred)){
stop("No cross-validated predictions can be retrieved from the model. Train with savePredictions=TRUE or provide calibration data")
}

## extract cv predictions from model
preds_all <- model$pred
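## keep only predictions made with the best hyperparameter combination,
## e.g. for a random forest tuned over mtry this keeps rows of model$pred
## where mtry equals model$bestTune$mtry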
for (i in 1:length(model$bestTune)){
tunevar <- names(model$bestTune[i])
preds_all <- preds_all[preds_all[,tunevar]==model$bestTune[,tunevar],]
}
preds_all <- preds_all[order(preds_all$rowIndex),c("pred","obs")]


## add DI from trainDI and LPD from trainLPD
preds_all$DI <- trainDI$trainDI[!is.na(trainDI$trainDI)]
preds_all$LPD <- trainDI$trainLPD[!is.na(trainDI$trainLPD)]
## only take predictions from inside the AOA:
preds_all <- preds_all[preds_all$DI<=trainDI$threshold,]
# preds_all <- preds_all[preds_all$LPD>0,]
attr(preds_all, "AOA_threshold") <- trainDI$threshold
attr(preds_all, "avrgLPD") <- trainDI$avrgLPD

return(preds_all)

}

15 changes: 8 additions & 7 deletions R/DItoErrormetric.R
@@ -41,18 +41,18 @@ DItoErrormetric <- function(model, trainDI, multiCV=FALSE,

# get DIs and Errormetrics OR calculate new ones from multiCV
if(!multiCV){
-preds_all <- get_preds_all(model, trainDI)
+preds_all <- get_preds_all_DI(model, trainDI)
}
if(multiCV){
-preds_all <- multiCV(model, length.out, method, useWeight)
+preds_all <- multiCV_DI(model, length.out, method, useWeight)
}

# train model between DI and Errormetric
-error_model = errorModel(preds_all, model, window.size, calib, k, m)
+error_model = errorModel_DI(preds_all, model, window.size, calib, k, m)

# save AOA threshold and raw data
attr(error_model, "AOA_threshold") <- attr(preds_all, "AOA_threshold")
class(error_model) <- c("errorModel", class(error_model))
class(error_model) <- c("errorModelDI", class(error_model))
return(error_model)
}

@@ -71,7 +71,7 @@ DItoErrormetric <- function(model, trainDI, multiCV=FALSE,
#' @return scam or lm
#'

-errorModel <- function(preds_all, model, window.size, calib, k, m){
+errorModel_DI <- function(preds_all, model, window.size, calib, k, m){

## use performance metric from the model:
rmse <- function(pred,obs){sqrt( mean((pred - obs)^2, na.rm = TRUE) )}
@@ -137,6 +137,7 @@ errorModel <- function(preds_all, model, window.size, calib, k, m){
data=performance,
family=stats::gaussian(link="identity"))
}

attr(errormodel, "performance") = performance

return(errormodel)
@@ -155,7 +156,7 @@ errorModel <- function(preds_all, model, window.size, calib, k, m){
#'
#'

-multiCV <- function(model, length.out, method, useWeight,...){
+multiCV_DI <- function(model, length.out, method, useWeight,...){

preds_all <- data.frame()
train_predictors <- model$trainingData[,-which(names(model$trainingData)==".outcome")]
@@ -207,7 +208,7 @@ multiCV <- function(model, length.out, method, useWeight,...){
#' @param trainDI, a trainDI
#'

-get_preds_all <- function(model, trainDI){
+get_preds_all_DI <- function(model, trainDI){

if(is.null(model$pred)){
stop("no cross-predictions can be retrieved from the model. Train with savePredictions=TRUE or provide calibration data")