From e687b7f0c1df5fdefda250097862fd2efa73791d Mon Sep 17 00:00:00 2001 From: bcjaeger Date: Mon, 10 Oct 2022 10:14:35 -0400 Subject: [PATCH 1/8] rand_forest_aorsf included in docs --- R/rand_forest_aorsf.R | 13 +++++++++ man/rmd/rand_forest_aorsf.Rmd | 54 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 R/rand_forest_aorsf.R create mode 100644 man/rmd/rand_forest_aorsf.Rmd diff --git a/R/rand_forest_aorsf.R b/R/rand_forest_aorsf.R new file mode 100644 index 000000000..b1b6d8d44 --- /dev/null +++ b/R/rand_forest_aorsf.R @@ -0,0 +1,13 @@ +#' Oblique random survival forests via aorsf +#' +#' [aorsf::orsf()] fits a model that creates a large number of decision +#' trees, each de-correlated from the others. The final prediction uses all +#' predictions from the individual trees and combines them. +#' +#' @includeRmd man/rmd/rand_forest_aorsf.md details +#' +#' @name details_rand_forest_aorsf +#' @keywords internal +NULL + +# See inst/README-DOCS.md for a description of how these files are processed diff --git a/man/rmd/rand_forest_aorsf.Rmd b/man/rmd/rand_forest_aorsf.Rmd new file mode 100644 index 000000000..6724e52a7 --- /dev/null +++ b/man/rmd/rand_forest_aorsf.Rmd @@ -0,0 +1,54 @@ +```{r, child = "aaa.Rmd", include = FALSE} +``` + + +`r descr_models("rand_forest", "aorsf")` + +## Tuning Parameters + +```{r aorsf-param-info, echo = FALSE} +defaults <- + tibble::tibble(parsnip = c("trees", "min_n", "mtry"), + default = c("500L", "5L", "ceiling(sqrt(n_predictors))")) + +param <- + rand_forest() %>% + set_engine("aorsf") %>% + set_mode("censored regression") %>% + make_parameter_list(defaults) %>% + distinct() +``` + +This model has `r nrow(param)` tuning parameters: + + ```{r aorsf-param-list, echo = FALSE, results = "asis"} +param$item +``` +# Translation from parsnip to the original package (censored regression) + +`r uses_extension("rand_forest", "aorsf", "censored regression")` + +```{r aorsf-creg} +library(censored) + 
+rand_forest() %>% + set_engine("aorsf") %>% + set_mode("censored regression") %>% + translate() +``` + +## Preprocessing requirements + +This engine does not require any special encoding of the predictors. Dummy variables are not required for this model. + +## Other details + +The default behavior of the `aorsf` R package is to throw an error message if an `orsf` model is asked to predict survival at a time that exceeds the maximum observed time in its training data. However, for consistency with other engines in `tidymodels`, if a random forest model fitted with the `aorsf` engine is asked to predict survival at a time exceeding the maximum observed event time, it will return a predicted survival probability at the maximum observed time in its training data. + +## References + +- Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure LA, Howard G, Simon N. Oblique random survival forests. Annals of applied statistics 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 + +- Jaeger BC, Welden S, Lenoir K, Pajewski NM. aorsf: An R package for supervised learning using the oblique random survival forest. Journal of Open Source Software 2022, 7(77), 4705. https://doi.org/10.21105/joss.04705. + +- Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. arXiv e-prints 2022 Aug; arXiv-2208. 
URL: https://arxiv.org/abs/2208.01129 From 968a09febbf898fcbd851923c19e903f5961282f Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 1 Nov 2022 18:34:15 +0000 Subject: [PATCH 2/8] make engine arg tunable --- R/tunable.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/R/tunable.R b/R/tunable.R index 602d5eaaa..65cff10b4 100644 --- a/R/tunable.R +++ b/R/tunable.R @@ -146,6 +146,18 @@ partykit_engine_args <- component_id = "engine" ) +aorsf_engine_args <- + tibble::tibble( + name = c( + "split_min_stat" + ), + call_info = list( + list(pkg = "dials", fun = "conditional_min_criterion") + ), + source = "model_spec", + component = "rand_forest", + component_id = "engine" + ) earth_engine_args <- tibble::tibble( @@ -259,6 +271,8 @@ tunable_rand_forest <- function(x, ...) { res <- add_engine_parameters(res, randomForest_engine_args) } else if (x$engine == "partykit") { res <- add_engine_parameters(res, partykit_engine_args) + } else if (x$engine == "aorsf") { + res <- add_engine_parameters(res, aorsf_engine_args) } res } From 9f6a5befad6b768e8b6f744e73e31e3c8ec96c52 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 1 Nov 2022 18:40:07 +0000 Subject: [PATCH 3/8] add `fit_xy()` method for `rand_forest()` so that it can error for the `aorsf` engine --- DESCRIPTION | 2 +- R/rand_forest.R | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a33d1e3c7..d2ec9343c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: parsnip Title: A Common API to Modeling and Analysis Functions -Version: 1.0.2.9000 +Version: 1.0.2.9004 Authors@R: c( person("Max", "Kuhn", , "max@rstudio.com", role = c("aut", "cre")), person("Davis", "Vaughan", , "davis@rstudio.com", role = "aut"), diff --git a/R/rand_forest.R b/R/rand_forest.R index 8655a7d9e..6cb05cb20 100644 --- a/R/rand_forest.R +++ b/R/rand_forest.R @@ -163,3 +163,24 @@ check_args.rand_forest <- function(object) { # move translate 
checks here? invisible(object) } + +# ------------------------------------------------------------------------------ + +#' @export +fit_xy.rand_forest <- function(object, + x, + y, + case_weights = NULL, + control = parsnip::control_parsnip(), + ...) { + + if (object$mode == "censored regression" && object$engine == "aorsf") { + # aorsf::orsf() requires two variables on the left-hand side of the formula, + # either in as `Surv(time, status) ~ .` or as `time + status ~ .` + rlang::abort("For the `'aorsf'` engine, please use the formula interface via `fit()`.") + } + + # call parsnip::fit_xy.model_spec() + res <- NextMethod() + res +} From 35eeda1a21f9c15349851c4c91d938ce922e6043 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 1 Nov 2022 18:40:44 +0000 Subject: [PATCH 4/8] Update `models.tsv` --- inst/models.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/models.tsv b/inst/models.tsv index 76cd99185..b4412c9fc 100644 --- a/inst/models.tsv +++ b/inst/models.tsv @@ -105,6 +105,7 @@ "poisson_reg" "regression" "zeroinfl" "poissonreg" "proportional_hazards" "censored regression" "glmnet" "censored" "proportional_hazards" "censored regression" "survival" "censored" +"rand_forest" "censored regression" "aorsf" "censored" "rand_forest" "censored regression" "partykit" "censored" "rand_forest" "classification" "h2o" "agua" "rand_forest" "classification" "partykit" "bonsai" From 4632852d1bf6ca52be55f151b4ef072ec2588d00 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 1 Nov 2022 18:41:31 +0000 Subject: [PATCH 5/8] Update engine docs for `aorsf` --- man/rmd/rand_forest_aorsf.Rmd | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/man/rmd/rand_forest_aorsf.Rmd b/man/rmd/rand_forest_aorsf.Rmd index 6724e52a7..e4fc5d352 100644 --- a/man/rmd/rand_forest_aorsf.Rmd +++ b/man/rmd/rand_forest_aorsf.Rmd @@ -1,7 +1,6 @@ ```{r, child = "aaa.Rmd", include = FALSE} ``` - `r descr_models("rand_forest", "aorsf")` ## Tuning Parameters @@ 
-21,9 +20,15 @@ param <- This model has `r nrow(param)` tuning parameters: - ```{r aorsf-param-list, echo = FALSE, results = "asis"} +```{r aorsf-param-list, echo = FALSE, results = "asis"} param$item ``` + +Additionally, this model has one engine-specific tuning parameter: + + * `split_min_stat`: Minimum test statistic required to split a node. Default is `3.841459` for the log-rank test, which is roughly a p-value of 0.05. + + # Translation from parsnip to the original package (censored regression) `r uses_extension("rand_forest", "aorsf", "censored regression")` @@ -39,11 +44,12 @@ rand_forest() %>% ## Preprocessing requirements -This engine does not require any special encoding of the predictors. Dummy variables are not required for this model. +```{r child = "template-tree-split-factors.Rmd"} +``` ## Other details -The default behavior of the `aorsf` R package is to throw an error message if an `orsf` model is asked to predict survival at a time that exceeds the maximum observed time in its training data. However, for consistency with other engines in `tidymodels`, if a random forest model fitted with the `aorsf` engine is asked to predict survival at a time exceeding the maximum observed event time, it will return a predicted survival probability at the maximum observed time in its training data. +Predictions of survival probability at a time exceeding the maximum observed event time are the predicted survival probability at the maximum observed time in the training data. 
## References From 6416c8595b9629ec7585c61993a850df67f67a17 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 1 Nov 2022 18:42:20 +0000 Subject: [PATCH 6/8] re-document to update `.md` and `.Rd` --- NAMESPACE | 1 + man/details_rand_forest_aorsf.Rd | 79 ++++++++++++++++++++++++++++++++ man/rmd/rand_forest_aorsf.md | 61 ++++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 man/details_rand_forest_aorsf.Rd create mode 100644 man/rmd/rand_forest_aorsf.md diff --git a/NAMESPACE b/NAMESPACE index e977f8311..8b1e1a49f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ S3method(extract_spec_parsnip,model_fit) S3method(fit,model_spec) S3method(fit_xy,gen_additive_mod) S3method(fit_xy,model_spec) +S3method(fit_xy,rand_forest) S3method(glance,model_fit) S3method(has_multi_predict,default) S3method(has_multi_predict,model_fit) diff --git a/man/details_rand_forest_aorsf.Rd b/man/details_rand_forest_aorsf.Rd new file mode 100644 index 000000000..34c96c3f3 --- /dev/null +++ b/man/details_rand_forest_aorsf.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rand_forest_aorsf.R +\name{details_rand_forest_aorsf} +\alias{details_rand_forest_aorsf} +\title{Oblique random survival forests via aorsf} +\description{ +\code{\link[aorsf:orsf]{aorsf::orsf()}} fits a model that creates a large number of decision +trees, each de-correlated from the others. The final prediction uses all +predictions from the individual trees and combines them. 
+} +\details{ +For this engine, there is a single mode: censored regression +\subsection{Tuning Parameters}{ + +This model has 3 tuning parameters: +\itemize{ +\item \code{trees}: # Trees (type: integer, default: 500L) +\item \code{min_n}: Minimal Node Size (type: integer, default: 5L) +\item \code{mtry}: # Randomly Selected Predictors (type: integer, default: +ceiling(sqrt(n_predictors))) +} + +Additionally, this model has one engine-specific tuning parameter: +\itemize{ +\item \code{split_min_stat}: Minimum test statistic required to split a node. +Default is \code{3.841459} for the log-rank test, which is roughly a +p-value of 0.05. +} +} +} +\section{Translation from parsnip to the original package (censored regression)}{ +\if{html}{\out{
}}\preformatted{library(censored) + +rand_forest() \%>\% + set_engine("aorsf") \%>\% + set_mode("censored regression") \%>\% + translate() +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## Random Forest Model Specification (censored regression) +## +## Computational engine: aorsf +## +## Model fit template: +## aorsf::orsf(formula = missing_arg(), data = missing_arg(), weights = missing_arg()) +}\if{html}{\out{
}} +\subsection{Preprocessing requirements}{ + +This engine does not require any special encoding of the predictors. +Categorical predictors can be partitioned into groups of factor levels +(e.g. \verb{\{a, c\}} vs \verb{\{b, d\}}) when splitting at a node. Dummy variables +are not required for this model. +} + +\subsection{Other details}{ + +Predictions of survival probability at a time exceeding the maximum +observed event time are the predicted survival probability at the +maximum observed time in the training data. +} + +\subsection{References}{ +\itemize{ +\item Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure +LA, Howard G, Simon N. Oblique random survival forests. Annals of +applied statistics 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 +\item Jaeger BC, Welden S, Lenoir K, Pajewski NM. aorsf: An R package for +supervised learning using the oblique random survival forest. +Journal of Open Source Software 2022, 7(77), 4705. +\url{https://doi.org/10.21105/joss.04705}. +\item Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, +Pajewski NM. Accelerated and interpretable oblique random survival +forests. arXiv e-prints 2022 Aug; arXiv-2208. URL: +\url{https://arxiv.org/abs/2208.01129} +} +} +} + +\keyword{internal} diff --git a/man/rmd/rand_forest_aorsf.md b/man/rmd/rand_forest_aorsf.md new file mode 100644 index 000000000..398cc595a --- /dev/null +++ b/man/rmd/rand_forest_aorsf.md @@ -0,0 +1,61 @@ + + + +For this engine, there is a single mode: censored regression + +## Tuning Parameters + + + +This model has 3 tuning parameters: + +- `trees`: # Trees (type: integer, default: 500L) + +- `min_n`: Minimal Node Size (type: integer, default: 5L) + +- `mtry`: # Randomly Selected Predictors (type: integer, default: ceiling(sqrt(n_predictors))) + +Additionally, this model has one engine-specific tuning parameter: + + * `split_min_stat`: Minimum test statistic required to split a node. 
Default is `3.841459` for the log-rank test, which is roughly a p-value of 0.05. + + +# Translation from parsnip to the original package (censored regression) + + + + +```r +library(censored) + +rand_forest() %>% + set_engine("aorsf") %>% + set_mode("censored regression") %>% + translate() +``` + +``` +## Random Forest Model Specification (censored regression) +## +## Computational engine: aorsf +## +## Model fit template: +## aorsf::orsf(formula = missing_arg(), data = missing_arg(), weights = missing_arg()) +``` + +## Preprocessing requirements + + +This engine does not require any special encoding of the predictors. Categorical predictors can be partitioned into groups of factor levels (e.g. `{a, c}` vs `{b, d}`) when splitting at a node. Dummy variables are not required for this model. + +## Other details + +Predictions of survival probability at a time exceeding the maximum observed event time are the predicted survival probability at the maximum observed time in the training data. + +## References + +- Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min YI, Mcclure LA, Howard G, Simon N. Oblique random survival forests. Annals of applied statistics 2019 Sep; 13(3):1847-83. DOI: 10.1214/19-AOAS1261 + +- Jaeger BC, Welden S, Lenoir K, Pajewski NM. aorsf: An R package for supervised learning using the oblique random survival forest. Journal of Open Source Software 2022, 7(77), 4705. https://doi.org/10.21105/joss.04705. + +- Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski NM. Accelerated and interpretable oblique random survival forests. arXiv e-prints 2022 Aug; arXiv-2208. 
URL: https://arxiv.org/abs/2208.01129 From 9683be4442c222ac070dc0ed0bcd308b1c881e04 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Thu, 3 Nov 2022 18:29:15 +0000 Subject: [PATCH 7/8] add note on case weights --- man/rmd/rand_forest_aorsf.Rmd | 5 +++++ man/rmd/rand_forest_aorsf.md | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/man/rmd/rand_forest_aorsf.Rmd b/man/rmd/rand_forest_aorsf.Rmd index e4fc5d352..5a31d9ee3 100644 --- a/man/rmd/rand_forest_aorsf.Rmd +++ b/man/rmd/rand_forest_aorsf.Rmd @@ -47,6 +47,11 @@ rand_forest() %>% ```{r child = "template-tree-split-factors.Rmd"} ``` +## Case weights + +```{r child = "template-uses-case-weights.Rmd"} +``` + ## Other details Predictions of survival probability at a time exceeding the maximum observed event time are the predicted survival probability at the maximum observed time in the training data. diff --git a/man/rmd/rand_forest_aorsf.md b/man/rmd/rand_forest_aorsf.md index 398cc595a..1c56f8fea 100644 --- a/man/rmd/rand_forest_aorsf.md +++ b/man/rmd/rand_forest_aorsf.md @@ -22,7 +22,7 @@ Additionally, this model has one engine-specific tuning parameter: # Translation from parsnip to the original package (censored regression) - +The **censored** extension package is required to fit this model. ```r @@ -48,6 +48,13 @@ rand_forest() %>% This engine does not require any special encoding of the predictors. Categorical predictors can be partitioned into groups of factor levels (e.g. `{a, c}` vs `{b, d}`) when splitting at a node. Dummy variables are not required for this model. +## Case weights + + +This model can utilize case weights during model fitting. To use them, see the documentation in [case_weights] and the examples on `tidymodels.org`. + +The `fit()` and `fit_xy()` functions have arguments called `case_weights` that expect vectors of case weights. 
+ ## Other details Predictions of survival probability at a time exceeding the maximum observed event time are the predicted survival probability at the maximum observed time in the training data. From 639fbdb966006dc7bf0bb932b1cf23c7a483a280 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Thu, 3 Nov 2022 18:30:46 +0000 Subject: [PATCH 8/8] update status this should be removed once the changes in aorsf are on CRAN --- R/rand_forest.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/rand_forest.R b/R/rand_forest.R index 6cb05cb20..4880d4464 100644 --- a/R/rand_forest.R +++ b/R/rand_forest.R @@ -175,8 +175,9 @@ fit_xy.rand_forest <- function(object, ...) { if (object$mode == "censored regression" && object$engine == "aorsf") { - # aorsf::orsf() requires two variables on the left-hand side of the formula, + # CRAN aorsf::orsf() requires two variables on the left-hand side of the formula, # either in as `Surv(time, status) ~ .` or as `time + status ~ .` + # see https://github.com/ropensci/aorsf/issues/11 rlang::abort("For the `'aorsf'` engine, please use the formula interface via `fit()`.") }