diff --git a/DESCRIPTION b/DESCRIPTION
index cfa1443bb..8619a5a60 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: parsnip
 Title: A Common API to Modeling and Analysis Functions
-Version: 1.0.2.9003
+Version: 1.0.2.9004
 Authors@R: c(
     person("Max", "Kuhn", , "max@rstudio.com", role = c("aut", "cre")),
     person("Davis", "Vaughan", , "davis@rstudio.com", role = "aut"),
diff --git a/NAMESPACE b/NAMESPACE
index 77a011c37..a64bb4d0e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,6 +9,7 @@ S3method(fit,model_spec)
 S3method(fit_xy,decision_tree)
 S3method(fit_xy,gen_additive_mod)
 S3method(fit_xy,model_spec)
+S3method(fit_xy,rand_forest)
 S3method(glance,model_fit)
 S3method(has_multi_predict,default)
 S3method(has_multi_predict,model_fit)
diff --git a/R/rand_forest.R b/R/rand_forest.R
index 8655a7d9e..4880d4464 100644
--- a/R/rand_forest.R
+++ b/R/rand_forest.R
@@ -163,3 +163,38 @@ check_args.rand_forest <- function(object) {
   # move translate checks here?
   invisible(object)
 }
+
+# ------------------------------------------------------------------------------
+
+# `fit_xy()` method for `rand_forest()` model specifications.
+#
+# Signals an error for the "aorsf" engine in "censored regression" mode (the
+# comment in the body explains why) and otherwise defers to the next
+# `fit_xy()` method through `NextMethod()`.
+#
+# Only `object$mode` and `object$engine` are inspected here; `x`, `y`,
+# `case_weights`, `control`, and `...` are not used directly by this method.
+# Returns whatever the next method returns.
+#' @export
+fit_xy.rand_forest <- function(object,
+                               x,
+                               y,
+                               case_weights = NULL,
+                               control = parsnip::control_parsnip(),
+                               ...) {
+
+  # `isTRUE()` reduces each comparison to a single TRUE/FALSE, so a `NULL` or
+  # `NA` mode/engine skips this guard instead of making `if ()` fail.
+  is_censored <- isTRUE(object$mode == "censored regression")
+  is_aorsf <- isTRUE(object$engine == "aorsf")
+
+  if (is_censored && is_aorsf) {
+    # CRAN aorsf::orsf() requires two variables on the left-hand side of the
+    # formula, either as `Surv(time, status) ~ .` or as `time + status ~ .`
+    # see https://github.com/ropensci/aorsf/issues/11
+    rlang::abort("For the `'aorsf'` engine, please use the formula interface via `fit()`.")
+  }
+
+  # call parsnip::fit_xy.model_spec()
+  NextMethod()
+}
diff --git a/R/rand_forest_aorsf.R b/R/rand_forest_aorsf.R
new file mode 100644
index 000000000..b1b6d8d44
--- /dev/null
+++ b/R/rand_forest_aorsf.R
@@ -0,0 +1,13 @@
+#' Oblique random survival forests via aorsf
+#'
+#' [aorsf::orsf()] fits a model that creates a large number of decision
+#' trees, each de-correlated from the others. 
The final prediction uses all
+#' predictions from the individual trees and combines them.
+#'
+#' @includeRmd man/rmd/rand_forest_aorsf.md details
+#'
+#' @name details_rand_forest_aorsf
+#' @keywords internal
+NULL
+
+# See inst/README-DOCS.md for a description of how these files are processed
diff --git a/R/tunable.R b/R/tunable.R
index 5f2d143f8..d203c88cb 100644
--- a/R/tunable.R
+++ b/R/tunable.R
@@ -159,6 +159,18 @@ partykit_engine_args <-
     component_id = "engine"
   )
 
+# Tunable arguments that are specific to the "aorsf" engine; handed to
+# `add_engine_parameters()` by `tunable_rand_forest()` for that engine.
+# Each `call_info` entry names a function and the package it lives in.
+aorsf_engine_args <- tibble::tibble(
+  name = "split_min_stat",
+  call_info = list(
+    list(pkg = "dials", fun = "conditional_min_criterion")
+  ),
+  source = "model_spec",
+  component = "rand_forest",
+  component_id = "engine"
+)
 
 earth_engine_args <-
   tibble::tibble(
@@ -284,6 +296,8 @@ tunable_rand_forest <- function(x, ...) {
     res <- add_engine_parameters(res, randomForest_engine_args)
   } else if (x$engine == "partykit") {
     res <- add_engine_parameters(res, partykit_engine_args)
+  } else if (x$engine == "aorsf") {
+    res <- add_engine_parameters(res, aorsf_engine_args)
   }
   res
 }
diff --git a/inst/models.tsv b/inst/models.tsv
index f56754593..cc0a2851c 100644
--- a/inst/models.tsv
+++ b/inst/models.tsv
@@ -105,6 +105,7 @@
 "poisson_reg" "regression" "zeroinfl" "poissonreg"
 "proportional_hazards" "censored regression" "glmnet" "censored"
 "proportional_hazards" "censored regression" "survival" "censored"
+"rand_forest" "censored regression" "aorsf" "censored"
 "rand_forest" "censored regression" "partykit" "censored"
 "rand_forest" "classification" "h2o" "agua"
 "rand_forest" "classification" "partykit" "bonsai"
diff --git a/man/details_rand_forest_aorsf.Rd b/man/details_rand_forest_aorsf.Rd
new file mode 100644
index 000000000..34c96c3f3
--- /dev/null
+++ b/man/details_rand_forest_aorsf.Rd
@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rand_forest_aorsf.R
+\name{details_rand_forest_aorsf}
+\alias{details_rand_forest_aorsf}
+\title{Oblique random 
survival forests via aorsf} +\description{ +\code{\link[aorsf:orsf]{aorsf::orsf()}} fits a model that creates a large number of decision +trees, each de-correlated from the others. The final prediction uses all +predictions from the individual trees and combines them. +} +\details{ +For this engine, there is a single mode: censored regression +\subsection{Tuning Parameters}{ + +This model has 3 tuning parameters: +\itemize{ +\item \code{trees}: # Trees (type: integer, default: 500L) +\item \code{min_n}: Minimal Node Size (type: integer, default: 5L) +\item \code{mtry}: # Randomly Selected Predictors (type: integer, default: +ceiling(sqrt(n_predictors))) +} + +Additionally, this model has one engine-specific tuning parameter: +\itemize{ +\item \code{split_min_stat}: Minimum test statistic required to split a node. +Default is \code{3.841459} for the log-rank test, which roughly corresponds +to a p-value of 0.05. +} +} +} +\section{Translation from parsnip to the original package (censored regression)}{ +\if{html}{\out{
}}\preformatted{library(censored) + +rand_forest() \%>\% + set_engine("aorsf") \%>\% + set_mode("censored regression") \%>\% + translate() +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## Random Forest Model Specification (censored regression) +## +## Computational engine: aorsf +## +## Model fit template: +## aorsf::orsf(formula = missing_arg(), data = missing_arg(), weights = missing_arg()) +}\if{html}{\out{
}} +\subsection{Preprocessing requirements}{ + +This engine does not require any special encoding of the predictors. +Categorical predictors can be partitioned into groups of factor levels +(e.g. \verb{\{a, c\}} vs \verb{\{b, d\}}) when splitting at a node. Dummy variables +are not required for this model. +} + +\subsection{Other details}{ + +Predictions of survival probability at a time exceeding the maximum +observed event time are the predicted survival probability at the +maximum observed time in the training data. +} + +\subsection{References}{ +\itemize{ +\item Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure +LA, Howard G, Simon N. Oblique random survival forests. Annals of +applied statistics 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 +\item Jaeger BC, Welden S, Lenoir K, Pajewski NM. aorsf: An R package for +supervised learning using the oblique random survival forest. +Journal of Open Source Software 2022, 7(77), 4705. +\url{https://doi.org/10.21105/joss.04705}. +\item Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, +Pajewski NM. Accelerated and interpretable oblique random survival +forests. arXiv e-prints 2022 Aug; arXiv-2208. 
URL: +\url{https://arxiv.org/abs/2208.01129} +} +} +} + +\keyword{internal} diff --git a/man/rmd/rand_forest_aorsf.Rmd b/man/rmd/rand_forest_aorsf.Rmd new file mode 100644 index 000000000..5a31d9ee3 --- /dev/null +++ b/man/rmd/rand_forest_aorsf.Rmd @@ -0,0 +1,65 @@ +```{r, child = "aaa.Rmd", include = FALSE} +``` + +`r descr_models("rand_forest", "aorsf")` + +## Tuning Parameters + +```{r aorsf-param-info, echo = FALSE} +defaults <- + tibble::tibble(parsnip = c("trees", "min_n", "mtry"), + default = c("500L", "5L", "ceiling(sqrt(n_predictors))")) + +param <- + rand_forest() %>% + set_engine("aorsf") %>% + set_mode("censored regression") %>% + make_parameter_list(defaults) %>% + distinct() +``` + +This model has `r nrow(param)` tuning parameters: + +```{r aorsf-param-list, echo = FALSE, results = "asis"} +param$item +``` + +Additionally, this model has one engine-specific tuning parameter: + + * `split_min_stat`: Minimum test statistic required to split a node. Default is `3.841459` for the log-rank test, which roughly corresponds to a p-value of 0.05. + + +# Translation from parsnip to the original package (censored regression) + +`r uses_extension("rand_forest", "aorsf", "censored regression")` + +```{r aorsf-creg} +library(censored) + +rand_forest() %>% + set_engine("aorsf") %>% + set_mode("censored regression") %>% + translate() +``` + +## Preprocessing requirements + +```{r child = "template-tree-split-factors.Rmd"} +``` + +## Case weights + +```{r child = "template-uses-case-weights.Rmd"} +``` + +## Other details + +Predictions of survival probability at a time exceeding the maximum observed event time are the predicted survival probability at the maximum observed time in the training data. + +## References + +- Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure LA, Howard G, Simon N. Oblique random survival forests. Annals of applied statistics 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 + +- Jaeger BC, Welden S, Lenoir K, Pajewski NM. 
aorsf: An R package for supervised learning using the oblique random survival forest. Journal of Open Source Software 2022, 7(77), 4705. https://doi.org/10.21105/joss.04705. + +- Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. arXiv e-prints 2022 Aug; arXiv-2208. URL: https://arxiv.org/abs/2208.01129 diff --git a/man/rmd/rand_forest_aorsf.md b/man/rmd/rand_forest_aorsf.md new file mode 100644 index 000000000..1c56f8fea --- /dev/null +++ b/man/rmd/rand_forest_aorsf.md @@ -0,0 +1,68 @@ + + + +For this engine, there is a single mode: censored regression + +## Tuning Parameters + + + +This model has 3 tuning parameters: + +- `trees`: # Trees (type: integer, default: 500L) + +- `min_n`: Minimal Node Size (type: integer, default: 5L) + +- `mtry`: # Randomly Selected Predictors (type: integer, default: ceiling(sqrt(n_predictors))) + +Additionally, this model has one engine-specific tuning parameter: + + * `split_min_stat`: Minimum test statistic required to split a node. Default is `3.841459` for the log-rank test, which roughly corresponds to a p-value of 0.05. + + +# Translation from parsnip to the original package (censored regression) + +The **censored** extension package is required to fit this model. + + +```r +library(censored) + +rand_forest() %>% + set_engine("aorsf") %>% + set_mode("censored regression") %>% + translate() +``` + +``` +## Random Forest Model Specification (censored regression) +## +## Computational engine: aorsf +## +## Model fit template: +## aorsf::orsf(formula = missing_arg(), data = missing_arg(), weights = missing_arg()) +``` + +## Preprocessing requirements + + +This engine does not require any special encoding of the predictors. Categorical predictors can be partitioned into groups of factor levels (e.g. `{a, c}` vs `{b, d}`) when splitting at a node. Dummy variables are not required for this model. 
+ +## Case weights + + +This model can utilize case weights during model fitting. To use them, see the documentation in [case_weights] and the examples on `tidymodels.org`. + +The `fit()` and `fit_xy()` functions have arguments called `case_weights` that expect vectors of case weights. + +## Other details + +Predictions of survival probability at a time exceeding the maximum observed event time are the predicted survival probability at the maximum observed time in the training data. + +## References + +- Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure LA, Howard G, Simon N. Oblique random survival forests. Annals of applied statistics 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 + +- Jaeger BC, Welden S, Lenoir K, Pajewski NM. aorsf: An R package for supervised learning using the oblique random survival forest. Journal of Open Source Software 2022, 7(77), 4705. https://doi.org/10.21105/joss.04705. + +- Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. arXiv e-prints 2022 Aug; arXiv-2208. URL: https://arxiv.org/abs/2208.01129