Commit

Merge pull request #8 from certara/drying_it_off
Drying it off
certara-jcraig authored Dec 13, 2023
2 parents 462fbfc + 4d23863 commit 9254cfc
Showing 3 changed files with 121 additions and 188 deletions.
164 changes: 32 additions & 132 deletions R/cov_search.R
@@ -14,42 +14,23 @@
MLCovSearch <- function(tab, list_pop_param, cov_continuous, cov_factors, seed = 123) {

# Check that covariates supplied by user exist in the data
errors <- data_validation(tab, list_pop_param, cov_continuous, cov_factors)
if (length(errors) > 0) {
stop(paste0(errors, sep = "\n"), call. = FALSE)
}

data_validation(tab, list_pop_param, cov_continuous, cov_factors)

# Set seed
stopifnot(is.numeric(seed))
set.seed(seed)

stopifnot(requireNamespace("caret", quietly = TRUE))
# Selection of columns required
tab <- tab %>%
dplyr::select(ID, dplyr::all_of(list_pop_param), dplyr::all_of(cov_continuous), dplyr::all_of(cov_factors))

# In order to have the individual parameter and one point per subject
dat <- unique(tab) %>%
dplyr::mutate(dplyr::across(dplyr::all_of(cov_factors), as.factor))

# Data for XGBoost
pop_param <- dat %>% dplyr::select(dplyr::all_of(list_pop_param))
factors <- dat %>% dplyr::select(dplyr::all_of(cov_factors))
continuous <- dat %>% dplyr::select(dplyr::all_of(cov_continuous))

# Select columns and generate data for XGBoost
data <- col_select(tab, list_pop_param, cov_continuous, cov_factors)
pop_param <- data %>% dplyr::select(dplyr::all_of(list_pop_param))
factors <- data %>% dplyr::select(dplyr::all_of(cov_factors))
continuous <- data %>% dplyr::select(dplyr::all_of(cov_continuous))

# One-hot encoding of categorical covariates for covariates with more than 2 levels
modified_columns <- data.frame(matrix(ncol = 0, nrow = nrow(factors)))
for (col in names(factors)) {
if (is.factor(factors[[col]]) && nlevels(factors[[col]]) > 2) {
dmy <- caret::dummyVars(paste0("~", col), data = factors)
encoded <- data.frame(predict(dmy, newdata = factors))
modified_columns <- cbind(modified_columns,encoded)
} else {
modified_columns[[col]] <- as.numeric(as.character(factors[[col]]))
}
}

dat_XGB <- cbind(pop_param, modified_columns, continuous)

dat_XGB <- generate_dat_XGB(pop_param, factors, continuous)

full_covariate_xgm <- names(dat_XGB)
full_covariate_xgm <- setdiff(full_covariate_xgm, list_pop_param)
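Note: col_select() and generate_dat_XGB() are new helpers whose definitions are not visible here; they presumably live in the third changed file, which did not render (32 + 11 of the 121 added lines appear in the two diffs shown, leaving roughly 78 additions for that file). Judging from the inline code they replace, a plausible sketch — names and signatures taken from the call sites, bodies inferred, not the committed code:

col_select <- function(tab, list_pop_param, cov_continuous, cov_factors) {
  # Keep only the ID, parameter, and covariate columns, reduce to one row
  # per subject, and convert the categorical covariates to factors
  tab %>%
    dplyr::select(ID, dplyr::all_of(list_pop_param),
                  dplyr::all_of(cov_continuous), dplyr::all_of(cov_factors)) %>%
    unique() %>%
    dplyr::mutate(dplyr::across(dplyr::all_of(cov_factors), as.factor))
}

generate_dat_XGB <- function(pop_param, factors, continuous) {
  # One-hot encode factors with more than two levels; binary factors are
  # coerced straight to numeric
  modified_columns <- data.frame(matrix(ncol = 0, nrow = nrow(factors)))
  for (col in names(factors)) {
    if (is.factor(factors[[col]]) && nlevels(factors[[col]]) > 2) {
      dmy <- caret::dummyVars(paste0("~", col), data = factors)
      encoded <- data.frame(predict(dmy, newdata = factors))
      modified_columns <- cbind(modified_columns, encoded)
    } else {
      modified_columns[[col]] <- as.numeric(as.character(factors[[col]]))
    }
  }
  cbind(pop_param, modified_columns, continuous)
}

Factoring these out removes the duplicated selection and encoding code from MLCovSearch(), generate_residualsplots(), and generate_residualsplots2().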

@@ -176,14 +157,8 @@ MLCovSearch <- function(tab, list_pop_param, cov_continuous, cov_factors, seed =


if (length(list_cov[[1]]) != 0 ) {
xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)

xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

# predict on the test set with the new model
y.xgb.pred <- predict(xgb.mod, newdata = testing)
# evaluate the performance of the model
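Note: generate_xgb.mod() replaces the repeated inline xgboost::xgboost() call throughout both rendered files. Its definition also sits in the unrendered file; from the removed blocks it is presumably a thin wrapper over the shared hyperparameters:

generate_xgb.mod <- function(data, label) {
  # Settings shared by every fit in the covariate search:
  # 200 boosting rounds, squared-error regression, silent output
  xgboost::xgboost(
    data = data,
    label = label,
    nrounds = 200,
    objective = "reg:squarederror",
    verbose = 0
  )
}

Any future tuning of nrounds or the objective now happens in one place rather than at every call site.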
@@ -219,13 +194,7 @@ MLCovSearch <- function(tab, list_pop_param, cov_continuous, cov_factors, seed =
as.matrix(dat_XGB %>% dplyr::select(dplyr::all_of(list_cov[[1]])))

if (length(list_cov[[1]]) != 0) {
xgb.mod_final <- xgboost::xgboost(
data = x.selected_final,
label = y_xgb,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)
xgb.mod_final <- generate_xgb.mod(data = x.selected_final, label = y_xgb)

# Generate SHAP summary plot for the current parameter
shap_values <- SHAPforxgboost::shap.values(xgb_model = xgb.mod_final, X_train = x.selected_final)
@@ -253,30 +222,6 @@ MLCovSearch <- function(tab, list_pop_param, cov_continuous, cov_factors, seed =
)
}

data_validation <- function(tab, list_pop_param, cov_continuous, cov_factors) {
errors <- c()

for (i in 1:3) {
vectors <- list(list_pop_param, cov_continuous, cov_factors)
vector_names <- c("list_pop_param", "cov_continuous", "cov_factors")

missing_values <- setdiff(vectors[[i]], colnames(tab))

if (length(missing_values) > 0) {
error_message <-
paste( "The following values from", vector_names[[i]], "are missing in the dataset:", toString(missing_values))

errors <- c(errors, error_message)
}
}
return(errors)

}







#' Generate Residual Plots for Model Analysis
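Note: data_validation() now stops on its own, so each caller drops its errors/stop() wrapper and makes a bare data_validation(...) call. The updated definition is in the unrendered file; below is a sketch consistent with the new call sites. It also avoids the old stop(paste0(errors, sep = "\n")) quirk: paste0() has no sep argument, so "\n" was recycled as an extra string appended to each element rather than used as a separator, whereas paste(errors, collapse = "\n") expresses the intended join directly.

data_validation <- function(tab, list_pop_param, cov_continuous, cov_factors) {
  # Collect the names of any parameters/covariates absent from the data
  vectors <- list(list_pop_param = list_pop_param,
                  cov_continuous = cov_continuous,
                  cov_factors    = cov_factors)
  errors <- character(0)
  for (name in names(vectors)) {
    missing_values <- setdiff(vectors[[name]], colnames(tab))
    if (length(missing_values) > 0) {
      errors <- c(errors, paste("The following values from", name,
                                "are missing in the dataset:",
                                toString(missing_values)))
    }
  }
  # Fail here rather than returning the errors to the caller
  if (length(errors) > 0) stop(paste(errors, collapse = "\n"), call. = FALSE)
  invisible(TRUE)
}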
@@ -308,43 +253,23 @@ data_validation <- function(tab, list_pop_param, cov_continuous, cov_factors) {
generate_residualsplots <- function(tab, list_pop_param, cov_continuous, cov_factors, result_ML, result_5folds, i, seed = 123 ) {

# Check that covariates supplied by user exist in the data
errors <- data_validation(tab, list_pop_param, cov_continuous, cov_factors)
if (length(errors) > 0) {
stop(paste0(errors, sep = "\n"), call. = FALSE)
}
data_validation(tab, list_pop_param, cov_continuous, cov_factors)

stopifnot(is.numeric(seed))
set.seed(seed)
# Set seed
if (!is.null(seed)) {
stopifnot(is.numeric(seed))
set.seed(seed)
}

# Selection of columns required
tab <- tab %>%
dplyr::select(ID, dplyr::all_of(list_pop_param), dplyr::all_of(cov_continuous), dplyr::all_of(cov_factors))



# In order to have the individual parameter and one point per subject
dat <- unique(tab) %>%
dplyr::mutate(dplyr::across(dplyr::all_of(cov_factors), as.factor))

# Data for XGBoost
# Select columns and generate data for XGBoost
dat <- col_select(tab, list_pop_param, cov_continuous, cov_factors)
pop_param <- dat %>% dplyr::select(dplyr::all_of(list_pop_param))
factors <- dat %>% dplyr::select(dplyr::all_of(cov_factors))
continuous <- dat %>% dplyr::select(dplyr::all_of(cov_continuous))

# One-hot encoding of categorical covariates for covariates with more than 2 levels
modified_columns <- data.frame(matrix(ncol = 0, nrow = nrow(factors)))
for (col in names(factors)) {
if (is.factor(factors[[col]]) && nlevels(factors[[col]]) > 2) {
dmy <- caret::dummyVars(paste0("~", col), data = factors)
encoded <- data.frame(predict(dmy, newdata = factors))
modified_columns <- cbind(modified_columns,encoded)
} else {
modified_columns[[col]] <- as.numeric(as.character(factors[[col]]))
}
}

dat_XGB <- cbind(pop_param, modified_columns, continuous)

dat_XGB <- generate_dat_XGB(pop_param, factors, continuous)

full_covariate_xgm <- names(dat_XGB)
full_covariate_xgm <- setdiff(full_covariate_xgm, list_pop_param)

@@ -354,7 +279,6 @@ generate_residualsplots <- function(tab, list_pop_param, cov_continuous, cov_fac
res <- res %>% dplyr::na_if("")

result_ML <- as.matrix(result_ML) %>% dplyr::na_if("")
#result_ML[result_ML == ""] <- NA

# Assign the independent and dependent covariates
x_xgb <- data.matrix(dat_XGB[, c(full_covariate_xgm)])
@@ -380,14 +304,8 @@ generate_residualsplots <- function(tab, list_pop_param, cov_continuous, cov_fac
y.xgm_test <- y[-train.ind, ]

if (length(list_cov[[1]]) != 0 ) {
xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)

xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

# predict on the test set with the new model
y.xgb.pred <- predict(xgb.mod, newdata = testing)

@@ -425,14 +343,8 @@ generate_residualsplots <- function(tab, list_pop_param, cov_continuous, cov_fac
y.xgm_train <- y[train.ind, ]
y.xgm_test <- y[-train.ind, ]

xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)

xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

y.xgb.pred <- predict(xgb.mod, newdata = testing)
residuals <- y.xgb.pred - y.xgm_test

@@ -484,14 +396,8 @@ generate_residualsplots <- function(tab, list_pop_param, cov_continuous, cov_fac
y.xgm_test <- y[-train.ind, ]

if (length(list_cov_nb) != 0 ) {
xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)

xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

# predict on the test set with the new model
y.xgb.pred <- predict(xgb.mod, newdata = testing)

@@ -530,14 +436,8 @@ generate_residualsplots <- function(tab, list_pop_param, cov_continuous, cov_fac
y.xgm_train <- y[train.ind, ]
y.xgm_test <- y[-train.ind, ]

xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)

xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

y.xgb.pred <- predict(xgb.mod, newdata = testing)
residuals <- y.xgb.pred - y.xgm_test

67 changes: 11 additions & 56 deletions R/generate_residualsplots2.R
@@ -37,38 +37,17 @@ generate_residualsplots2 <- function(data, result, i, seed = NULL) {
result_5folds <- result$result_5folds

# Check that covariates supplied by user exist in the data
errors <- data_validation(data, list_pop_param, cov_continuous, cov_factors)
if (length(errors) > 0) {
stop(paste0(errors, sep = "\n"), call. = FALSE)
}

# Selection of columns required
data <- data %>%
dplyr::select(ID, dplyr::all_of(list_pop_param), dplyr::all_of(cov_continuous), dplyr::all_of(cov_factors))
data_validation(data, list_pop_param, cov_continuous, cov_factors)

# In order to have the individual parameter and one point per subject
dat <- unique(data) %>%
dplyr::mutate(across(dplyr::all_of(cov_factors), as.factor))

# Data for XGBoost
# Select columns and generate data for XGBoost
dat <- col_select(data, list_pop_param, cov_continuous, cov_factors)
pop_param <- dat %>% dplyr::select(dplyr::all_of(list_pop_param))
factors <- dat %>% dplyr::select(dplyr::all_of(cov_factors))
continuous <- dat %>% dplyr::select(dplyr::all_of(cov_continuous))

# One-hot encoding of categorical covariates for covariates with more than 2 levels
modified_columns <- data.frame(matrix(ncol = 0, nrow = nrow(factors)))
for (col in names(factors)) {
if (is.factor(factors[[col]]) && nlevels(factors[[col]]) > 2) {
dmy <- caret::dummyVars(paste0("~", col), data = factors)
encoded <- data.frame(predict(dmy, newdata = factors))
modified_columns <- cbind(modified_columns,encoded)
} else {
modified_columns[[col]] <- as.numeric(as.character(factors[[col]]))
}
}

dat_XGB <- cbind(pop_param, modified_columns, continuous)

dat_XGB <- generate_dat_XGB(pop_param, factors, continuous)

full_covariate_xgm <- names(dat_XGB)
full_covariate_xgm <- setdiff(full_covariate_xgm, list_pop_param)

@@ -85,7 +64,7 @@ generate_residualsplots2 <- function(data, result, i, seed = NULL) {
plots_list <- list() # Initialize the list to store plots

dat <- dat %>%
dplyr::mutate(across(dplyr::all_of(cov_factors), as.numeric))
dplyr::mutate(dplyr::across(dplyr::all_of(cov_factors), as.numeric))

# First case: covariates are selected after the vote
if (is.na(result_ML[i, 1]) == F) {
@@ -104,13 +83,7 @@ generate_residualsplots2 <- function(data, result, i, seed = NULL) {
y.xgm_test <- y[-train.ind, ]

if (length(list_cov[[1]]) != 0 ) {
xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)
xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

# predict on the test set with the new model
y.xgb.pred <- predict(xgb.mod, newdata = testing)
@@ -157,14 +130,8 @@ generate_residualsplots2 <- function(data, result, i, seed = NULL) {
y.xgm_train <- y[train.ind, ]
y.xgm_test <- y[-train.ind, ]

xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)

xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

y.xgb.pred <- predict(xgb.mod, newdata = testing)
residuals <- y.xgb.pred - y.xgm_test

@@ -238,13 +205,7 @@ generate_residualsplots2 <- function(data, result, i, seed = NULL) {
y.xgm_test <- y[-train.ind, ]

if (length(list_cov_nb) != 0 ) {
xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)
xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

# predict on the test set with the new model
y.xgb.pred <- predict(xgb.mod, newdata = testing)
@@ -286,13 +247,7 @@ generate_residualsplots2 <- function(data, result, i, seed = NULL) {
y.xgm_train <- y[train.ind, ]
y.xgm_test <- y[-train.ind, ]

xgb.mod <- xgboost::xgboost(
data = training,
label = y.xgm_train,
nrounds = 200,
objective = "reg:squarederror",
verbose = 0
)
xgb.mod <- generate_xgb.mod(data = training, label = y.xgm_train)

y.xgb.pred <- predict(xgb.mod, newdata = testing)
residuals <- y.xgb.pred - y.xgm_test
[Diff for the third changed file did not render.]
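For context, a hypothetical end-to-end call of the refactored entry points — the column names below are invented for illustration; only the function signatures come from the diffs above:

# Run the ML covariate search on one-row-per-subject parameter data
result <- MLCovSearch(
  tab            = my_data,        # data frame with ID, parameters, covariates
  list_pop_param = c("CL", "V"),   # individual parameter columns
  cov_continuous = c("AGE", "WT"), # continuous covariates
  cov_factors    = c("SEX"),       # categorical covariates
  seed           = 123
)

# Residual diagnostics for the i-th parameter, reusing the search results
plots <- generate_residualsplots2(my_data, result, i = 1, seed = 123)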
