Skip to content

Commit

Permalink
[R] Use new predict function. (#6819)
Browse files Browse the repository at this point in the history
* Call new C prediction API.
* Add `strict_shape`.
* Add `iterationrange`.
* Update document.
  • Loading branch information
trivialfis authored Jun 11, 2021
1 parent 25514e1 commit b56614e
Show file tree
Hide file tree
Showing 18 changed files with 290 additions and 157 deletions.
10 changes: 3 additions & 7 deletions R-package/R/callbacks.R
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,7 @@ cb.reset.parameters <- function(new_params) {
#' \itemize{
#' \item \code{best_score} the evaluation score at the best iteration
#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index)
#' \item \code{best_ntreelimit} to use with the \code{ntreelimit} parameter in \code{predict}.
#' It differs from \code{best_iteration} in multiclass or random forest settings.
#' }
#'
#' The Same values are also stored as xgb-attributes:
#' \itemize{
#' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models)
Expand Down Expand Up @@ -498,13 +495,12 @@ cb.cv.predict <- function(save_models = FALSE) {
rep(NA_real_, N)
}

ntreelimit <- NVL(env$basket$best_ntreelimit,
env$end_iteration * env$num_parallel_tree)
iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration) + 1)
if (NVL(env$params[['booster']], '') == 'gblinear') {
ntreelimit <- 0 # must be 0 for gblinear
iterationrange <- c(1, 1) # must be 0 for gblinear
}
for (fd in env$bst_folds) {
pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE)
if (is.matrix(pred)) {
pred[fd$index, ] <- pr
} else {
Expand Down
3 changes: 2 additions & 1 deletion R-package/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
} else {
res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]]
preds <- predict(booster_handle, w, outputmargin = TRUE, ntreelimit = 0) # predict using all trees
## predict using all trees
preds <- predict(booster_handle, w, outputmargin = TRUE, iterationrange = c(1, 1))
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)
Expand Down
212 changes: 119 additions & 93 deletions R-package/R/xgb.Booster.R
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' @param outputmargin whether the prediction should be returned in the for of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for
#' logistic regression would result in predictions for log-odds instead of probabilities.
#' @param ntreelimit limit the number of model's trees or boosting iterations used in prediction (see Details).
#' It will use all the trees by default (\code{NULL} value).
#' @param ntreelimit Deprecated, use \code{iterationrange} instead.
#' @param predleaf whether predict leaf index.
#' @param predcontrib whether to return feature contributions to individual predictions (see Details).
#' @param approxcontrib whether to use a fast approximation for feature contributions (see Details).
Expand All @@ -179,16 +178,19 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' or predinteraction flags is TRUE.
#' @param training whether is the prediction result used for training. For dart booster,
#' training predicting will perform dropout.
#' @param iterationrange Specifies which layer of trees are used in prediction. For
#' example, if a random forest is trained with 100 rounds. Specifying
#' `iteration_range=(1, 21)`, then only the forests built during [1, 21) (half open set)
#' rounds are used in this prediction. It's 1-based index just like R vector. When set
#' to \code{c(1, 1)} XGBoost will use all trees.
#' @param strict_shape Default is \code{FALSE}. When it's set to \code{TRUE}, output
#' type and shape of prediction are invariant to model type.
#'
#' @param ... Parameters passed to \code{predict.xgb.Booster}
#'
#' @details
#' Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
#' and it is not necessarily equal to the number of trees in a model.
#' E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
#' But for multiclass classification, while there are multiple trees per iteration,
#' \code{ntreelimit} limits the number of boosting iterations.
#'
#' Also note that \code{ntreelimit} would currently do nothing for predictions from gblinear,
#' Note that \code{iterationrange} would currently do nothing for predictions from gblinear,
#' since gblinear doesn't keep its boosting history.
#'
#' One possible practical applications of the \code{predleaf} option is to use the model
Expand All @@ -209,7 +211,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' of the most important features first. See below about the format of the returned results.
#'
#' @return
#' For regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
#' The return type is different depending whether \code{strict_shape} is set to \code{TRUE}. By default,
#' for regression or binary classification, it returns a vector of length \code{nrows(newdata)}.
#' For multiclass classification, either a \code{num_class * nrows(newdata)} vector or
#' a \code{(nrows(newdata), num_class)} dimension matrix is returned, depending on
#' the \code{reshape} value.
Expand All @@ -231,6 +234,13 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' For a multiclass case, a list of \code{num_class} elements is returned, where each element is
#' such an array.
#'
#' When \code{strict_shape} is set to \code{TRUE}, the output is always an array. For
#' normal prediction, the output is a 2-dimension array \code{(num_class, nrow(newdata))}.
#'
#' For \code{predcontrib = TRUE}, output is \code{(ncol(newdata) + 1, num_class, nrow(newdata))}
#' For \code{predinteraction = TRUE}, output is \code{(ncol(newdata) + 1, ncol(newdata) + 1, num_class, nrow(newdata))}
#' For \code{predleaf = TRUE}, output is \code{(n_trees_in_forest, num_class, n_iterations, nrow(newdata))}
#'
#' @seealso
#' \code{\link{xgb.train}}.
#'
Expand All @@ -253,7 +263,7 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' # use all trees by default
#' pred <- predict(bst, test$data)
#' # use only the 1st tree
#' pred1 <- predict(bst, test$data, ntreelimit = 1)
#' pred1 <- predict(bst, test$data, iterationrange = c(1, 2))
#'
#' # Predicting tree leafs:
#' # the result is an nsamples X ntrees matrix
Expand Down Expand Up @@ -305,113 +315,129 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
#' all.equal(pred, pred_labels)
#' # prediction from using only 5 iterations should result
#' # in the same error as seen in iteration 5:
#' pred5 <- predict(bst, as.matrix(iris[, -5]), ntreelimit=5)
#' pred5 <- predict(bst, as.matrix(iris[, -5]), iterationrange=c(1, 6))
#' sum(pred5 != lb)/length(lb)
#'
#'
#' ## random forest-like model of 25 trees for binary classification:
#'
#' set.seed(11)
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 5,
#' nthread = 2, nrounds = 1, objective = "binary:logistic",
#' num_parallel_tree = 25, subsample = 0.6, colsample_bytree = 0.1)
#' # Inspect the prediction error vs number of trees:
#' lb <- test$label
#' dtest <- xgb.DMatrix(test$data, label=lb)
#' err <- sapply(1:25, function(n) {
#' pred <- predict(bst, dtest, ntreelimit=n)
#' sum((pred > 0.5) != lb)/length(lb)
#' })
#' plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
#'
#' @rdname predict.xgb.Booster
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL,
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
reshape = FALSE, training = FALSE, ...) {

reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE)
if (!inherits(newdata, "xgb.DMatrix"))
newdata <- xgb.DMatrix(newdata, missing = missing)
if (!is.null(object[["feature_names"]]) &&
!is.null(colnames(newdata)) &&
!identical(object[["feature_names"]], colnames(newdata)))
stop("Feature names stored in `object` and `newdata` are different!")
if (is.null(ntreelimit))
ntreelimit <- NVL(object$best_ntreelimit, 0)
if (NVL(object$params[['booster']], '') == 'gblinear')

if (NVL(object$params[['booster']], '') == 'gblinear' || is.null(ntreelimit))
ntreelimit <- 0
if (ntreelimit < 0)
stop("ntreelimit cannot be negative")

option <- 0L + 1L * as.logical(outputmargin) + 2L * as.logical(predleaf) + 4L * as.logical(predcontrib) +
8L * as.logical(approxcontrib) + 16L * as.logical(predinteraction)
if (ntreelimit != 0 && is.null(iterationrange)) {
## only ntreelimit, initialize iteration range
iterationrange <- c(0, 0)
} else if (ntreelimit == 0 && !is.null(iterationrange)) {
## only iteration range, handle 1-based indexing
iterationrange <- c(iterationrange[1] - 1, iterationrange[2] - 1)
} else if (ntreelimit != 0 && !is.null(iterationrange)) {
## both are specified, let libgxgboost throw an error
} else {
## no limit is supplied, use best
if (is.null(object$best_iteration)) {
iterationrange <- c(0, 0)
} else {
## We don't need to + 1 as R is 1-based index.
iterationrange <- c(0, as.integer(object$best_iteration))
}
}
## Handle the 0 length values.
box <- function(val) {
if (length(val) == 0) {
cval <- vector(, 1)
cval[0] <- val
return(cval)
}
return (val)
}

## We set strict_shape to TRUE then drop the dimensions conditionally
args <- list(
training = box(training),
strict_shape = box(TRUE),
iteration_begin = box(as.integer(iterationrange[1])),
iteration_end = box(as.integer(iterationrange[2])),
ntree_limit = box(as.integer(ntreelimit)),
type = box(as.integer(0))
)

set_type <- function(type) {
if (args$type != 0) {
stop("One type of prediction at a time.")
}
return(box(as.integer(type)))
}
if (outputmargin) {
args$type <- set_type(1)
}
if (predcontrib) {
args$type <- set_type(if (approxcontrib) 3 else 2)
}
if (predinteraction) {
args$type <- set_type(if (approxcontrib) 5 else 4)
}
if (predleaf) {
args$type <- set_type(6)
}

ret <- .Call(XGBoosterPredict_R, object$handle, newdata, option[1],
as.integer(ntreelimit), as.integer(training))
predts <- .Call(
XGBoosterPredictFromDMatrix_R, object$handle, newdata, jsonlite::toJSON(args, auto_unbox = TRUE)
)
names(predts) <- c("shape", "results")
shape <- predts$shape
ret <- predts$results

n_ret <- length(ret)
n_row <- nrow(newdata)
npred_per_case <- n_ret / n_row
if (n_row != shape[1]) {
stop("Incorrect predict shape.")
}

if (n_ret %% n_row != 0)
stop("prediction length ", n_ret, " is not multiple of nrows(newdata) ", n_row)
arr <- array(data = ret, dim = rev(shape))

if (predleaf) {
ret <- if (n_ret == n_row) {
matrix(ret, ncol = 1)
} else {
matrix(ret, nrow = n_row, byrow = TRUE)
}
} else if (predcontrib) {
n_col1 <- ncol(newdata) + 1
n_group <- npred_per_case / n_col1
cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
ret <- if (n_ret == n_row) {
matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_group == 1) {
matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames))
} else {
arr <- aperm(
a = array(
data = ret,
dim = c(n_col1, n_group, n_row),
dimnames = list(cnames, NULL, NULL)
),
perm = c(2, 3, 1) # [group, row, col]
)
lapply(seq_len(n_group), function(g) arr[g, , ])
cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
if (predcontrib) {
dimnames(arr) <- list(cnames, NULL, NULL)
if (!strict_shape) {
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
}
} else if (predinteraction) {
n_col1 <- ncol(newdata) + 1
n_group <- npred_per_case / n_col1^2
cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
ret <- if (n_ret == n_row) {
matrix(ret, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_group == 1) {
aperm(
a = array(
data = ret,
dim = c(n_col1, n_col1, n_row),
dimnames = list(cnames, cnames, NULL)
),
perm = c(3, 1, 2)
)
} else {
arr <- aperm(
a = array(
data = ret,
dim = c(n_col1, n_col1, n_group, n_row),
dimnames = list(cnames, cnames, NULL, NULL)
),
perm = c(3, 4, 1, 2) # [group, row, col1, col2]
)
lapply(seq_len(n_group), function(g) arr[g, , , ])
dimnames(arr) <- list(cnames, cnames, NULL, NULL)
if (!strict_shape) {
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
}
} else if (reshape && npred_per_case > 1) {
ret <- matrix(ret, nrow = n_row, byrow = TRUE)
}
return(ret)

if (!strict_shape) {
n_groups <- shape[2]
if (predleaf) {
arr <- matrix(arr, nrow = n_row, byrow = TRUE)
} else if (predcontrib && n_groups != 1) {
arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
} else if (predinteraction && n_groups != 1) {
arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
} else if (!reshape && n_groups != 1) {
arr <- ret
} else if (reshape && n_groups != 1) {
arr <- matrix(arr, ncol = 3, byrow = TRUE)
}
arr <- drop(arr)
if (length(dim(arr)) == 1) {
arr <- as.vector(arr)
} else if (length(dim(arr)) == 2) {
arr <- as.matrix(arr)
}
}
return(arr)
}

#' @rdname predict.xgb.Booster
Expand Down
4 changes: 1 addition & 3 deletions R-package/R/xgb.cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,7 @@
#' parameter or randomly generated.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
#' which could further be used in \code{predict} method
#' (only available with early stopping).
#' \item \code{best_ntreelimit} and the \code{ntreelimit} Deprecated attributes, use \code{best_iteration} instead.
#' \item \code{pred} CV prediction values available when \code{prediction} is set.
#' It is either vector or matrix (see \code{\link{cb.cv.predict}}).
#' \item \code{models} a list of the CV folds' models. It is only available with the explicit
Expand Down
3 changes: 0 additions & 3 deletions R-package/R/xgb.train.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,6 @@
#' explicitly passed.
#' \item \code{best_iteration} iteration number with the best evaluation metric value
#' (only available with early stopping).
#' \item \code{best_ntreelimit} the \code{ntreelimit} value corresponding to the best iteration,
#' which could further be used in \code{predict} method
#' (only available with early stopping).
#' \item \code{best_score} the best evaluation metric value during early stopping.
#' (only available with early stopping).
#' \item \code{feature_names} names of the training dataset features
Expand Down
3 changes: 0 additions & 3 deletions R-package/man/cb.early.stop.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b56614e

Please sign in to comment.