From 8a1e2d624adbaed5c031b00261f317c0f93f92b4 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 14 Sep 2023 16:18:30 -0700 Subject: [PATCH 01/22] start CDC baseline layer --- R/layer_cdc_flatline_quantiles.R | 102 +++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 R/layer_cdc_flatline_quantiles.R diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R new file mode 100644 index 000000000..0e2fa4f1b --- /dev/null +++ b/R/layer_cdc_flatline_quantiles.R @@ -0,0 +1,102 @@ +layer_cdc_flatline_quantiles <- function( + frosting, + ..., + aheads = 1:4, + quantiles = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e5, + by_key = "geo_value", + symmetrize = FALSE, + id = rand_id("cdc_baseline_bands")) { + + rlang::check_dots_empty() + + arg_is_int(aheads) + arg_is_probabilities(quantiles) + arg_is_pos_int(nsims) + arg_is_scalar(nsims) + arg_is_chr_scalar(id) + arg_is_lgl_scalar(symmetrize) + arg_is_chr(by_key, allow_null = TRUE, allow_na = TRUE, allow_empty = TRUE) + + add_layer( + frosting, + layer_cdc_flatline_quantiles_new( + aheads = aheads, + quantiles = quantiles, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + id = id + ) + ) +} + +layer_cdc_flatline_quantiles_new <- function( + aheads, + quantiles, + nsims, + by_key, + symmetrize, + id +) { + layer( + "cdc_flatline_quantiles", + aheads, + quantiles, + nsims, + by_key, + symmetrize, + id + ) +} + +#' @export +slather.layer_cdc_flatline_quantiles <- + function(object, components, workflow, new_data, ...) { + the_fit <- workflows::extract_fit_parsnip(workflow) + s <- ifelse(object$symmetrize, -1, NA) + r <- grab_residuals(the_fit, components) + + ## Handle any grouping requests + if (length(object$by_key) > 0L) { + key_cols <- dplyr::bind_cols( + geo_value = components$mold$extras$roles$geo_value, + components$mold$extras$roles$key + ) + common <- intersect(object$by_key, names(key_cols)) + excess <- setdiff(object$by_key, names(key_cols)) + if (length(excess) > 0L) { + rlang::warn( + "Requested residual grouping key(s) {excess} are unavailable ", + "in the original data. Grouping by the remainder: {common}." + ) + } + if (length(common) > 0L) { + r <- r %>% dplyr::select(tidyselect::any_of(c(common, ".resid"))) + common_in_r <- common[common %in% names(r)] + if (length(common_in_r) != length(common)) { + rlang::warn( + "Some grouping keys are not in data.frame returned by the", + "`residuals()` method. Groupings may not be correct." + ) + } + r <- dplyr::bind_cols(key_cols, r) %>% + dplyr::group_by(!!!rlang::syms(common)) + } + } + + + + + + + # always return components + components + } + +propogate_samples <- function(x, p, horizon, nsim, symmetrize) { + samp <- quantile(x, probs = c(0, seq_len(nsim)) / nsim) + + for (iter in seq(horizon)) {} +} + From cea1599dfad47e77c8a8026a66b16963ed7c86dd Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 22 Sep 2023 10:33:18 -0700 Subject: [PATCH 02/22] upgrade enframer --- R/utils-enframer.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/R/utils-enframer.R b/R/utils-enframer.R index 387d04356..0a8152906 100644 --- a/R/utils-enframer.R +++ b/R/utils-enframer.R @@ -13,10 +13,11 @@ enframer <- function(df, x, fill = NA) { } enlist <- function(...) { - # in epiprocess - x <- list(...) - n <- as.character(sys.call())[-1] - if (!is.null(n0 <- names(x))) n[n0 != ""] <- n0[n0 != ""] - names(x) <- n - x + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) } From d606741cb05d8546f82ca057e5515a53b19b2a05 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 23 Sep 2023 16:33:50 -0700 Subject: [PATCH 03/22] functions, remains to check validity --- R/compat-purrr.R | 5 + R/layer_cdc_flatline_quantiles.R | 144 +++++++++++++++++------- R/layer_residual_quantiles.R | 8 +- tests/testthat/test-propogate_samples.R | 8 ++ tests/testthat/test-shuffle.R | 5 + 5 files changed, 128 insertions(+), 42 deletions(-) create mode 100644 tests/testthat/test-propogate_samples.R create mode 100644 tests/testthat/test-shuffle.R diff --git a/R/compat-purrr.R b/R/compat-purrr.R index 7e28bd630..712926f73 100644 --- a/R/compat-purrr.R +++ b/R/compat-purrr.R @@ -32,6 +32,11 @@ map_chr <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, character(1), ...) } +map_vec <- function(.x, .f, ...) { + out <- map(.x, .f, ...) + vctrs::list_unchop(out) +} + map_dfr <- function(.x, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map(.x, .f, ...) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 0e2fa4f1b..c64b96c2d 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -6,6 +6,7 @@ layer_cdc_flatline_quantiles <- function( nsims = 1e5, by_key = "geo_value", symmetrize = FALSE, + nonneg = TRUE, id = rand_id("cdc_baseline_bands")) { rlang::check_dots_empty() @@ -15,7 +16,7 @@ layer_cdc_flatline_quantiles <- function( arg_is_pos_int(nsims) arg_is_scalar(nsims) arg_is_chr_scalar(id) - arg_is_lgl_scalar(symmetrize) + arg_is_lgl_scalar(symmetrize, nonneg) arg_is_chr(by_key, allow_null = TRUE, allow_na = TRUE, allow_empty = TRUE) add_layer( @@ -26,6 +27,7 @@ layer_cdc_flatline_quantiles <- function( nsims = nsims, by_key = by_key, symmetrize = symmetrize, + nonneg = nonneg, id = id ) ) @@ -37,16 +39,18 @@ layer_cdc_flatline_quantiles_new <- function( nsims, by_key, symmetrize, + nonneg, id ) { layer( "cdc_flatline_quantiles", - aheads, - quantiles, - nsims, - by_key, - symmetrize, - id + aheads = aheads, + quantiles = quantiles, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + nonneg = nonneg, + id = id ) } @@ -54,49 +58,113 @@ layer_cdc_flatline_quantiles_new <- function( slather.layer_cdc_flatline_quantiles <- function(object, components, workflow, new_data, ...) { the_fit <- workflows::extract_fit_parsnip(workflow) - s <- ifelse(object$symmetrize, -1, NA) + if (!inherits(the_fit, "_flatline")) { + cli::cli_warn( + c("Predictions for this workflow were not produced by the {.cls flatline}", + "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}.") + ) + } + p <- components$predictions + ek <- kill_time_value(epi_keys_mold(components$mold)) r <- grab_residuals(the_fit, components) - ## Handle any grouping requests + avail_grps <- character(0L) if (length(object$by_key) > 0L) { - key_cols <- dplyr::bind_cols( - geo_value = components$mold$extras$roles$geo_value, - components$mold$extras$roles$key - ) - common <- intersect(object$by_key, names(key_cols)) - excess <- setdiff(object$by_key, names(key_cols)) - if (length(excess) > 0L) { - rlang::warn( - "Requested residual grouping key(s) {excess} are unavailable ", - "in the original data. Grouping by the remainder: {common}." - ) + cols_in_preds <- hardhat::check_column_names(p, object$by_key) + if (!cols_in_preds$ok) { + cli::cli_warn(c( + "Predicted values are missing key columns: {.var cols_in_preds$missing_names}.", + "Ignoring these." + )) } - if (length(common) > 0L) { - r <- r %>% dplyr::select(tidyselect::any_of(c(common, ".resid"))) - common_in_r <- common[common %in% names(r)] - if (length(common_in_r) != length(common)) { - rlang::warn( - "Some grouping keys are not in data.frame returned by the", - "`residuals()` method. Groupings may not be correct." - ) + if (inherits(the_fit, "_flatline")) { + cols_in_resids <- hardhat::check_column_names(r, object$by_key) + if (!cols_in_resids$ok) { + cli::cli_warn(c( + "Existing residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Ignoring these." + )) + } + # use only the keys that are in the predictions and requested. + avail_grps <- intersect(ek, setdiff( + object$by_key, + c(cols_in_preds$missing_names, cols_in_resids$missing_names) + )) + } else { # not flatline, but we'll try + key_cols <- dplyr::bind_cols( + geo_value = components$mold$extras$roles$geo_value, + components$mold$extras$roles$key + ) + cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) + if (!cols_in_resids$ok) { + cli::cli_warn(c( + "Requested residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Ignoring these." + )) } - r <- dplyr::bind_cols(key_cols, r) %>% - dplyr::group_by(!!!rlang::syms(common)) + avail_grps <- intersect(names(key_cols), setdiff( + object$by_key, + c(cols_in_preds$missing_names, cols_in_resids$missing_names) + )) + r <- dplyr::bind_cols(key_cols, r) } } + r <- r %>% + dplyr::select(tidyselect::all_of(c(avail_grps, ".resid"))) %>% + dplyr::group_by(!!!rlang::syms(avail_grps)) %>% + dplyr::summarise(.resid = list(.resid), .groups = "drop") + res <- dplyr::left_join(p, r, by = avail_grps) %>% + dplyr::rowwise() %>% + dplyr::mutate( + .pred_distn_all = propogate_samples( + .resid, .pred, object$quantiles, + object$aheads, object$nsim, object$symmetrize, object$nonneg + ) + ) %>% + dplyr::select(tidyselect::all_of(c(avail_grps, ".pred_distn_all"))) - - - - - # always return components + # res <- check_pname(res, components$predictions, object) + components$predictions <- dplyr::left_join( + components$predictions, + res, + by = avail_grps + ) components } -propogate_samples <- function(x, p, horizon, nsim, symmetrize) { - samp <- quantile(x, probs = c(0, seq_len(nsim)) / nsim) +propogate_samples <- function( + r, p, quantiles, aheads, nsim, symmetrize, nonneg) { + max_ahead <- max(aheads) + samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) + res <- list() - for (iter in seq(horizon)) {} + # p should be all the same + p <- max(p, na.rm = TRUE) + + raw <- samp + p + if (nonneg) raw <- pmax(0, raw) + res[[1]] <- raw + if (max_ahead > 1L) { + for (iter in 2:max_ahead) { + samp <- shuffle(samp) + raw <- raw + samp + if (symmetrize) symmetric <- raw - (median(raw) + p) + else symmetric <- raw + if (nonneg) symmetric <- pmax(0, symmetric) + res[[iter]] <- symmetric + } + } + res <- res[aheads] + list(tibble::tibble( + aheads = aheads, + .pred_distn = map_vec( + res, ~ dist_quantiles(quantile(.x, quantiles), tau = quantiles) + ) + )) } +shuffle <- function(x) { + stopifnot(is.vector(x)) + sample(x, length(x), replace = FALSE) +} diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index a9a8cab24..b9a71e265 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -141,8 +141,8 @@ grab_residuals <- function(the_fit, components) { if (".resid" %in% names(r)) { # success return(r) } else { # failure - rlang::warn(c( - "The `residuals()` method for objects of class {cl} results in", + cli::cli_warn(c( + "The `residuals()` method for {.cls cl} objects results in", "a data frame without a column named `.resid`.", i = "Residual quantiles will be calculated directly from the", i = "difference between predictions and observations.", @@ -152,8 +152,8 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - rlang::warn(c( - "The `residuals()` method for objects of class {cl} results in an", + cli::cli_warn(c( + "The `residuals()` method for {.cls cl} objects results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", i = "Residual quantiles will be calculated directly from the", diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propogate_samples.R new file mode 100644 index 000000000..3b02404b6 --- /dev/null +++ b/tests/testthat/test-propogate_samples.R @@ -0,0 +1,8 @@ +test_that("propogate_samples", { + r <- -30:50 + p <- 40 + quantiles <- 1:9 / 10 + aheads <- c(2, 4, 7) + nsim <- 100 + +}) diff --git a/tests/testthat/test-shuffle.R b/tests/testthat/test-shuffle.R new file mode 100644 index 000000000..94bc1aa3b --- /dev/null +++ b/tests/testthat/test-shuffle.R @@ -0,0 +1,5 @@ +test_that("shuffle works", { + expect_error(shuffle(matrix(NA, 2, 2))) + expect_length(shuffle(1:10), 10L) + expect_identical(sort(shuffle(1:10)), 1:10) +}) From 7294c000fba0fc2a3dc2298ce654a70879a58627 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 11:19:09 -0700 Subject: [PATCH 04/22] correct symmetrization, enhance documentation of the "ahead" param in `flatline_forecaster()`. --- R/flatline_forecaster.R | 6 ++++++ R/layer_cdc_flatline_quantiles.R | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index e437f50ea..197c8cca5 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -94,6 +94,12 @@ flatline_forecaster <- function( #' Constructs a list of arguments for [flatline_forecaster()]. #' #' @inheritParams arx_args_list +#' @param ahead Integer. Unlike [arx_forecaster()], this doesn't have any effect +#' on the predicted values. Predictions are always the most recent observation. +#' However, this _does_ impact the residuals stored in the object. Residuals +#' are calculated based on this number to mimic how badly you would have done. +#' So for example, `ahead = 7` will create residuals by comparing values +#' 7 days apart. #' #' @return A list containing updated parameter choices with class `flatline_alist`. #' @export diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index c64b96c2d..3f178f6da 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -149,7 +149,7 @@ propogate_samples <- function( for (iter in 2:max_ahead) { samp <- shuffle(samp) raw <- raw + samp - if (symmetrize) symmetric <- raw - (median(raw) + p) + if (symmetrize) symmetric <- raw - (median(raw) - p) else symmetric <- raw if (nonneg) symmetric <- pmax(0, symmetric) res[[iter]] <- symmetric @@ -157,7 +157,7 @@ propogate_samples <- function( } res <- res[aheads] list(tibble::tibble( - aheads = aheads, + ahead = aheads, .pred_distn = map_vec( res, ~ dist_quantiles(quantile(.x, quantiles), tau = quantiles) ) From f18e88f7b45602c4bbdf4a26d32252f2203485b2 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 11:29:14 -0700 Subject: [PATCH 05/22] better defaults, cli, pred is scalar in propagate_samples --- R/layer_cdc_flatline_quantiles.R | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 3f178f6da..f2b55e5ec 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -3,7 +3,7 @@ layer_cdc_flatline_quantiles <- function( ..., aheads = 1:4, quantiles = c(.01, .025, 1:19 / 20, .975, .99), - nsims = 1e5, + nsims = 1e3, by_key = "geo_value", symmetrize = FALSE, nonneg = TRUE, @@ -73,7 +73,7 @@ slather.layer_cdc_flatline_quantiles <- cols_in_preds <- hardhat::check_column_names(p, object$by_key) if (!cols_in_preds$ok) { cli::cli_warn(c( - "Predicted values are missing key columns: {.var cols_in_preds$missing_names}.", + "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", "Ignoring these." )) } @@ -81,7 +81,7 @@ slather.layer_cdc_flatline_quantiles <- cols_in_resids <- hardhat::check_column_names(r, object$by_key) if (!cols_in_resids$ok) { cli::cli_warn(c( - "Existing residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) } @@ -98,7 +98,7 @@ slather.layer_cdc_flatline_quantiles <- cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) if (!cols_in_resids$ok) { cli::cli_warn(c( - "Requested residuals are missing key columns: {.var cols_in_resids$missing_names}.", + "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) } @@ -139,9 +139,6 @@ propogate_samples <- function( samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) res <- list() - # p should be all the same - p <- max(p, na.rm = TRUE) - raw <- samp + p if (nonneg) raw <- pmax(0, raw) res[[1]] <- raw From d6a28f371d0f8436b436da2b511a8e02816094c6 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 16:40:31 -0700 Subject: [PATCH 06/22] redocument --- NAMESPACE | 2 + R/layer_cdc_flatline_quantiles.R | 94 +++++++++++++++++++++ man/add_frosting.Rd | 4 +- man/arx_fcast_epi_workflow.Rd | 12 ++- man/arx_forecaster.Rd | 12 ++- man/create_layer.Rd | 6 +- man/dist_quantiles.Rd | 2 +- man/extrapolate_quantiles.Rd | 8 +- man/fit-epi_workflow.Rd | 2 +- man/flatline.Rd | 6 +- man/flatline_args_list.Rd | 8 +- man/frosting.Rd | 4 +- man/layer_add_forecast_date.Rd | 12 +-- man/layer_add_target_date.Rd | 6 +- man/layer_cdc_flatline_quantiles.Rd | 125 ++++++++++++++++++++++++++++ man/layer_population_scaling.Rd | 29 ++++--- man/layer_predict.Rd | 6 +- man/nested_quantiles.Rd | 4 +- man/smooth_quantile_reg.Rd | 27 +++--- man/step_epi_shift.Rd | 2 +- man/step_growth_rate.Rd | 4 +- man/step_lag_difference.Rd | 4 +- man/step_population_scaling.Rd | 26 +++--- man/step_training_window.Rd | 9 +- 24 files changed, 340 insertions(+), 74 deletions(-) create mode 100644 man/layer_cdc_flatline_quantiles.Rd diff --git a/NAMESPACE b/NAMESPACE index b22ec53a5..e361dec00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -79,6 +79,7 @@ S3method(residuals,flatline) S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_add_forecast_date) S3method(slather,layer_add_target_date) +S3method(slather,layer_cdc_flatline_quantiles) S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) @@ -131,6 +132,7 @@ export(is_layer) export(layer) export(layer_add_forecast_date) export(layer_add_target_date) +export(layer_cdc_flatline_quantiles) export(layer_naomit) export(layer_point_from_distn) export(layer_population_scaling) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index f2b55e5ec..afee37577 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -1,3 +1,97 @@ +#' CDC Flatline Forecast Quantiles +#' +#' This layer creates quantile forecasts by taking a sample from the +#' interpolated CDF of the flatline residuals, and shuffling them. +#' These are then added on to the point prediction. +#' +#' @details +#' This layer is intended to be used in concert with [flatline()]. But it can +#' also be used with anything else. As long as residuals are available in the +#' the fitted model, this layer could be useful. Like +#' [layer_residual_quantiles()] it only uses the residuals for the fitted model +#' object. However, it propagates these forward for *all* aheads, by +#' iteratively shuffling them (randomly), and then adding them to the previous +#' set. This is in contrast to what happens with the [flatline_forecaster()]. +#' When using [flatline()] as the underlying engine (here), both will result in the +#' same predictions (the most recent observed value), but that model calculates +#' separate residuals for each `ahead` by comparing to observations further into +#' the future. This version continues to use the same set of residuals, and +#' adds them on to produce wider intervals as `ahead` increases. +#' +#' +#' @inheritParams layer_residual_quantiles +#' @param aheads Numeric vector of desired forecast horizons. These should be +#' given in the "units of the training data". So, for example, for data +#' typically observed daily (possibly with missing values), but +#' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with +#' weekly data, you would use `1:4`. +#' @param quantiles Numeric vector of probabilities with values in (0,1) +#' referring to the desired predictive intervals. The default is the standard +#' set for the COVID Forecast Hub. +#' @param nsims Positive integer. The number of draws from the empirical CDF. +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +#' in linear interpolation on the X scale. This is achieved with +#' [stats::quantile()] Type 7 (the default for that function). +#' @param nonneg Logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this +#' has slightly different behaviour than would occur if using +#' [layer_threshold_preds()]. +#' +#' @return an updated `frosting` postprocessor. Calling [predict()] will result +#' in an additional `` named `.pred_distn_all` containing 2-column +#' [tibble::tibble()]'s. For each +#' desired combination of `key`'s, the tibble will contain one row per ahead +#' with the associated [dist_quantiles()]. +#' @export +#' +#' @examples +#' r <- epi_recipe(case_death_rate_subset) %>% +#' # data is "daily", so we fit this to 1 ahead, the result will contain +#' # 1 day ahead residuals +#' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% +#' recipes::update_role(death_rate, new_role = "predictor") %>% +#' recipes::add_role(time_value, geo_value, new_role = "predictor") +#' +#' forecast_date <- max(case_death_rate_subset$time_value) +#' +#' latest <- get_test_data( +#' epi_recipe(case_death_rate_subset), case_death_rate_subset +#' ) +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) +#' +#' eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") +#' +#' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset) +#' preds <- suppressWarnings(predict(wf, new_data = latest)) %>% +#' dplyr::select(-time_value) %>% +#' dplyr::mutate(forecast_date = forecast_date) +#' preds +#' +#' preds <- preds %>% +#' unnest(.pred_distn_all) %>% +#' pivot_quantiles(.pred_distn) %>% +#' mutate(target_date = forecast_date + ahead) +#' +#' library(ggplot2) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line(data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate)) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Death rate") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' +#' layer_cdc_flatline_quantiles <- function( frosting, ..., diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index d7d217777..4d77572a1 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -35,7 +35,9 @@ latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) # Add frosting to a workflow and predict -f <- frosting() \%>\% layer_predict() \%>\% layer_naomit(.pred) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) p1 <- predict(wf1, latest) p1 diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd index fdd309959..7a6b66305 100644 --- a/man/arx_fcast_epi_workflow.Rd +++ b/man/arx_fcast_epi_workflow.Rd @@ -41,12 +41,16 @@ use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate")) +arx_fcast_epi_workflow( + jhu, "death_rate", + c("case_rate", "death_rate") +) arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_forecaster]{arx_forecaster()}} diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index d4866aa0e..e121f272c 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -41,12 +41,16 @@ that it estimates a model for a particular target horizon. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate")) +out <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") +) out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_fcast_epi_workflow]{arx_fcast_epi_workflow()}}, \code{\link[=arx_args_list]{arx_args_list()}} diff --git a/man/create_layer.Rd b/man/create_layer.Rd index 399d62efa..d36385fb2 100644 --- a/man/create_layer.Rd +++ b/man/create_layer.Rd @@ -20,9 +20,9 @@ fill in the name of the layer, and open the file. \examples{ \dontrun{ - # Note: running this will write `layer_strawberry.R` to - # the `R/` directory of your current project - create_layer("strawberry") +# Note: running this will write `layer_strawberry.R` to +# the `R/` directory of your current project +create_layer("strawberry") } } diff --git a/man/dist_quantiles.Rd b/man/dist_quantiles.Rd index 50f00dc32..739bae5a8 100644 --- a/man/dist_quantiles.Rd +++ b/man/dist_quantiles.Rd @@ -15,7 +15,7 @@ dist_quantiles(x, tau) A distribution parameterized by a set of quantiles } \examples{ -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) quantile(dstn, p = c(.1, .25, .5, .9)) median(dstn) diff --git a/man/extrapolate_quantiles.Rd b/man/extrapolate_quantiles.Rd index 985d7cae8..cc6cb2c3c 100644 --- a/man/extrapolate_quantiles.Rd +++ b/man/extrapolate_quantiles.Rd @@ -24,12 +24,14 @@ library(distributional) dstn <- dist_normal(c(10, 2), c(5, 10)) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) # because this distribution is already quantiles, any extra quantiles are # appended extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- c(dist_normal(c(10, 2), c(5, 10)), - dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8)))) +dstn <- c( + dist_normal(c(10, 2), c(5, 10)), + dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) +) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) } diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index fb1c3af28..3dfa0029a 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -29,7 +29,7 @@ preprocessing the data and fitting the underlying parsnip model. } \examples{ jhu <- case_death_rate_subset \%>\% -filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) + filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/flatline.Rd b/man/flatline.Rd index a396cfeb9..c353ff163 100644 --- a/man/flatline.Rd +++ b/man/flatline.Rd @@ -38,8 +38,10 @@ This is an internal function that is used to create a \code{\link[parsnip:linear model. It has somewhat odd behaviour (see below). } \examples{ -tib <- data.frame(y = runif(100), - expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5)) \%>\% +tib <- data.frame( + y = runif(100), + expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5) +) \%>\% dplyr::group_by(k, j) \%>\% dplyr::mutate(y2 = dplyr::lead(y, 2)) # predict 2 steps ahead flat <- flatline(y2 ~ j + k + y, tib) # predictions for 20 locations diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 55d93c1db..dcae448f1 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -17,8 +17,12 @@ flatline_args_list( ) } \arguments{ -\item{ahead}{Integer. Number of time steps ahead (in days) of the forecast -date for which forecasts should be produced.} +\item{ahead}{Integer. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have any effect +on the predicted values. Predictions are always the most recent observation. +However, this \emph{does} impact the residuals stored in the object. Residuals +are calculated based on this number to mimic how badly you would have done. +So for example, \code{ahead = 7} will create residuals by comparing values +7 days apart.} \item{n_training}{Integer. An upper limit for the number of rows per key that are used for training diff --git a/man/frosting.Rd b/man/frosting.Rd index 83a8d6a9d..362c40a4f 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -24,8 +24,8 @@ The arguments are currently placeholders and must be NULL \examples{ # Toy example to show that frosting can be created and added for postprocessing - f <- frosting() - wf <- epi_workflow() \%>\% add_frosting(f) +f <- frosting() +wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example jhu <- case_death_rate_subset \%>\% diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index 421978eb5..4e173d662 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -46,15 +46,17 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) - # Don't specify `forecast_date` (by default, this should be last date in latest) -f <- frosting() \%>\% layer_predict() \%>\% - layer_naomit(.pred) +# Don't specify `forecast_date` (by default, this should be last date in latest) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf0 <- wf \%>\% add_frosting(f) p0 <- predict(wf0, latest) p0 # Specify a `forecast_date` that is greater than or equal to `as_of` date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) @@ -73,7 +75,7 @@ p2 <- predict(wf2, latest) p2 # Do not specify a forecast_date - f3 <- frosting() \%>\% +f3 <- frosting() \%>\% layer_predict() \%>\% layer_add_forecast_date() \%>\% layer_naomit(.pred) diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index 58ff7770f..3c2884e10 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -48,7 +48,8 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- get_test_data(r, jhu) # Use ahead + forecast date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) @@ -59,7 +60,8 @@ p # Use ahead + max time value from pre, fit, post # which is the same if include `layer_add_forecast_date()` -f2 <- frosting() \%>\% layer_predict() \%>\% +f2 <- frosting() \%>\% + layer_predict() \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) wf2 <- wf \%>\% add_frosting(f2) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd new file mode 100644 index 000000000..c5bb33d3b --- /dev/null +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -0,0 +1,125 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_cdc_flatline_quantiles.R +\name{layer_cdc_flatline_quantiles} +\alias{layer_cdc_flatline_quantiles} +\title{CDC Flatline Forecast Quantiles} +\usage{ +layer_cdc_flatline_quantiles( + frosting, + ..., + aheads = 1:4, + quantiles = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + nsims = 1000, + by_key = "geo_value", + symmetrize = FALSE, + nonneg = TRUE, + id = rand_id("cdc_baseline_bands") +) +} +\arguments{ +\item{frosting}{a \code{frosting} postprocessor} + +\item{...}{Unused, include for consistency with other layers.} + +\item{aheads}{Numeric vector of desired forecast horizons. These should be +given in the "units of the training data". So, for example, for data +typically observed daily (possibly with missing values), but +with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with +weekly data, you would use \code{1:4}.} + +\item{quantiles}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} + +\item{nsims}{Positive integer. The number of draws from the empirical CDF. +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +in linear interpolation on the X scale. This is achieved with +\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} + +\item{by_key}{A character vector of keys to group the residuals by before +calculating quantiles. The default, \code{c()} performs no grouping.} + +\item{symmetrize}{logical. If \code{TRUE} then interval will be symmetric.} + +\item{nonneg}{Logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this +has slightly different behaviour than would occur if using +\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} + +\item{id}{a random id string} +} +\value{ +an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result +in an additional \verb{} named \code{.pred_distn_all} containing 2-column +\code{\link[tibble:tibble]{tibble::tibble()}}'s. For each +desired combination of \code{key}'s, the tibble will contain one row per ahead +with the associated \code{\link[=dist_quantiles]{dist_quantiles()}}. +} +\description{ +This layer creates quantile forecasts by taking a sample from the +interpolated CDF of the flatline residuals, and shuffling them. +These are then added on to the point prediction. +} +\details{ +This layer is intended to be used in concert with \code{\link[=flatline]{flatline()}}. But it can +also be used with anything else. As long as residuals are available in the +the fitted model, this layer could be useful. Like +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} it only uses the residuals for the fitted model +object. However, it propagates these forward for \emph{all} aheads, by +iteratively shuffling them (randomly), and then adding them to the previous +set. This is in contrast to what happens with the \code{\link[=flatline_forecaster]{flatline_forecaster()}}. +When using \code{\link[=flatline]{flatline()}} as the underlying engine (here), both will result in the +same predictions (the most recent observed value), but that model calculates +separate residuals for each \code{ahead} by comparing to observations further into +the future. This version continues to use the same set of residuals, and +adds them on to produce wider intervals as \code{ahead} increases. +} +\examples{ +r <- epi_recipe(case_death_rate_subset) \%>\% + # data is "daily", so we fit this to 1 ahead, the result will contain + # 1 day ahead residuals + step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% + recipes::update_role(death_rate, new_role = "predictor") \%>\% + recipes::add_role(time_value, geo_value, new_role = "predictor") + +forecast_date <- max(case_death_rate_subset$time_value) + +latest <- get_test_data( + epi_recipe(case_death_rate_subset), case_death_rate_subset +) + +f <- frosting() \%>\% + layer_predict() \%>\% + layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) + +eng <- parsnip::linear_reg() \%>\% parsnip::set_engine("flatline") + +wf <- epi_workflow(r, eng, f) \%>\% fit(case_death_rate_subset) +preds <- suppressWarnings(predict(wf, new_data = latest)) \%>\% + dplyr::select(-time_value) \%>\% + dplyr::mutate(forecast_date = forecast_date) +preds + +preds <- preds \%>\% + unnest(.pred_distn_all) \%>\% + pivot_quantiles(.pred_distn) \%>\% + mutate(target_date = forecast_date + ahead) + +library(ggplot2) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line(data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate)) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Death rate") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) + + +} diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index e841e9a50..179d6862c 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -78,13 +78,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -93,9 +95,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -104,9 +108,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% + dplyr::select(geo_value, time_value, cases) +) predict(wf, latest) } diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 1326dfe75..03473053f 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -62,9 +62,9 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% - step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_naomit() + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index 1a2824041..c4b578c1a 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -16,8 +16,8 @@ a list-col Turn a vector of quantile distributions into a list-col } \examples{ -edf <- case_death_rate_subset[1:3,] -edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5/6, 2:4/5, 3:10/11)) +edf <- case_death_rate_subset[1:3, ] +edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) edf_nested <- edf \%>\% dplyr::mutate(q = nested_quantiles(q)) edf_nested \%>\% tidyr::unnest(q) diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 293999876..6cc2dfc82 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -39,9 +39,10 @@ only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}} tib <- data.frame( y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), - x1 = rnorm(100), x2 = rnorm(100)) + x1 = rnorm(100), x2 = rnorm(100) +) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) -ff <- qr_spec \%>\% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +ff <- qr_spec \%>\% fit(cbind(y1, y2, y3, y4, y5, y6) ~ ., data = tib) p <- predict(ff, new_data = tib) x <- -99:99 / 100 * 2 * pi @@ -50,21 +51,23 @@ fd <- x[length(x) - 20] XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) XY <- tibble::as_tibble(XY) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) -tt <- qr_spec \%>\% fit_xy(x = XY[,21:41], y = XY[,1:20]) +tt <- qr_spec \%>\% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) library(tidyr) library(dplyr) pl <- predict( - object = tt, - new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] - ) + object = tt, + new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] +) pl <- pl \%>\% - unnest(.pred) \%>\% - mutate(distn = nested_quantiles(distn)) \%>\% - unnest(distn) \%>\% - mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, - ahead = NULL) \%>\% - pivot_wider(names_from = tau, values_from = q) + unnest(.pred) \%>\% + mutate(distn = nested_quantiles(distn)) \%>\% + unnest(distn) \%>\% + mutate( + x = x[length(x) - 20] + ahead / 100 * 2 * pi, + ahead = NULL + ) \%>\% + pivot_wider(names_from = tau, values_from = q) plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") curve(sin(x), add = TRUE) abline(v = fd, lty = 2) diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index ca8609b1e..bf135346e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -90,7 +90,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \examples{ r <- epi_recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_lag(death_rate, lag = c(0,7,14)) + step_epi_lag(death_rate, lag = c(0, 7, 14)) r } \seealso{ diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index 0449b887c..b409135b1 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -87,7 +87,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_growth_rate(case_rate, death_rate) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index d69c25faa..b06abe43c 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -59,7 +59,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 2964c6912..1a9564563 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -104,13 +104,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -119,9 +121,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -130,8 +134,10 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% dplyr::select(geo_value, time_value, cases) ) diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 7861f27ea..ce7c0fc74 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -50,9 +50,12 @@ after any filtering step. tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), by = 1, - length.out = 5), times = 2), - geo_value = rep(c("ca", "hi"), each = 5)) \%>\% + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, + length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) +) \%>\% as_epi_df() epi_recipe(y ~ x, data = tib) \%>\% From 237ec50ddd74d577184f68cad39a794e91468110 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 16:42:59 -0700 Subject: [PATCH 07/22] run styler --- R/layer_cdc_flatline_quantiles.R | 24 +++++++++++++--------- R/step_growth_rate.R | 27 ++++++++++++------------- R/step_lag_difference.R | 19 +++++++++-------- tests/testthat/test-propogate_samples.R | 1 - 4 files changed, 36 insertions(+), 35 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index afee37577..1881b8523 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -83,15 +83,16 @@ #' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + #' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + #' geom_line(aes(y = .pred), color = "orange") + -#' geom_line(data = case_death_rate_subset %>% filter(geo_value %in% four_states), -#' aes(x = time_value, y = death_rate)) + +#' geom_line( +#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate) +#' ) + #' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + #' labs(x = "Date", y = "Death rate") + #' facet_wrap(~geo_value, scales = "free_y") + #' theme_bw() + #' geom_vline(xintercept = forecast_date) #' -#' layer_cdc_flatline_quantiles <- function( frosting, ..., @@ -102,7 +103,6 @@ layer_cdc_flatline_quantiles <- function( symmetrize = FALSE, nonneg = TRUE, id = rand_id("cdc_baseline_bands")) { - rlang::check_dots_empty() arg_is_int(aheads) @@ -134,8 +134,7 @@ layer_cdc_flatline_quantiles_new <- function( by_key, symmetrize, nonneg, - id -) { + id) { layer( "cdc_flatline_quantiles", aheads = aheads, @@ -154,8 +153,10 @@ slather.layer_cdc_flatline_quantiles <- the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { cli::cli_warn( - c("Predictions for this workflow were not produced by the {.cls flatline}", - "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}.") + c( + "Predictions for this workflow were not produced by the {.cls flatline}", + "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." + ) ) } p <- components$predictions @@ -240,8 +241,11 @@ propogate_samples <- function( for (iter in 2:max_ahead) { samp <- shuffle(samp) raw <- raw + samp - if (symmetrize) symmetric <- raw - (median(raw) - p) - else symmetric <- raw + if (symmetrize) { + symmetric <- raw - (median(raw) - p) + } else { + symmetric <- raw + } if (nonneg) symmetric <- pmax(0, symmetric) res[[iter]] <- symmetric } diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index f6ad29a5b..74cfff284 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -42,20 +42,19 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_growth_rate <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), - log_scale = FALSE, - replace_Inf = NA, - prefix = "gr_", - columns = NULL, - skip = FALSE, - id = rand_id("growth_rate"), - additional_gr_args_list = list()) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), + log_scale = FALSE, + replace_Inf = NA, + prefix = "gr_", + columns = NULL, + skip = FALSE, + id = rand_id("growth_rate"), + additional_gr_args_list = list()) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 2482be46a..21878eaa7 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -23,16 +23,15 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_lag_difference <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - prefix = "lag_diff_", - columns = NULL, - skip = FALSE, - id = rand_id("lag_diff")) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + prefix = "lag_diff_", + columns = NULL, + skip = FALSE, + id = rand_id("lag_diff")) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/tests/testthat/test-propogate_samples.R b/tests/testthat/test-propogate_samples.R index 3b02404b6..b8a1ff82d 100644 --- a/tests/testthat/test-propogate_samples.R +++ b/tests/testthat/test-propogate_samples.R @@ -4,5 +4,4 @@ test_that("propogate_samples", { quantiles <- 1:9 / 10 aheads <- c(2, 4, 7) nsim <- 100 - }) From c13b83eb1371ebd0668ded2d6eb0495b17c4dd9e Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 24 Sep 2023 16:43:26 -0700 Subject: [PATCH 08/22] redocument after styling --- man/layer_cdc_flatline_quantiles.Rd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index c5bb33d3b..4f151e854 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -113,13 +113,14 @@ preds \%>\% geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + geom_line(aes(y = .pred), color = "orange") + - geom_line(data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), - aes(x = time_value, y = death_rate)) + + geom_line( + data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate) + ) + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + labs(x = "Date", y = "Death rate") + facet_wrap(~geo_value, scales = "free_y") + theme_bw() + geom_vline(xintercept = forecast_date) - } From 16f6c2c5da3fb906b0152348c29a328ada0ace55 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 10:04:36 -0700 Subject: [PATCH 09/22] example plotting with ggplot2 handled correctly --- R/layer_cdc_flatline_quantiles.R | 4 ++-- R/make_smooth_quantile_reg.R | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 1881b8523..7ff224359 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -75,7 +75,7 @@ #' pivot_quantiles(.pred_distn) %>% #' mutate(target_date = forecast_date + ahead) #' -#' library(ggplot2) +#' if (require("ggplot2")) { #' four_states <- c("ca", "pa", "wa", "ny") #' preds %>% #' filter(geo_value %in% four_states) %>% @@ -92,7 +92,7 @@ #' facet_wrap(~geo_value, scales = "free_y") + #' theme_bw() + #' geom_vline(xintercept = forecast_date) -#' +#' } layer_cdc_flatline_quantiles <- function( frosting, ..., diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index 6eab2a132..cfb08a9c7 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -61,7 +61,8 @@ #' lines(pl$x, pl$`0.2`, col = "blue") #' lines(pl$x, pl$`0.8`, col = "blue") #' lines(pl$x, pl$`0.5`, col = "red") -#' \dontrun{ +#' +#' if (require("ggplot2")) { #' ggplot(data.frame(x = x, y = y), aes(x)) + #' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + #' geom_point(aes(y = y), colour = "grey") + # observed data From ce0b1808f23f86e1d2b736dea8e523d0b068d0de Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 12:33:18 -0700 Subject: [PATCH 10/22] finish quantile pivotting helpers, redocument --- NAMESPACE | 3 +- NEWS.md | 2 +- R/dist_quantiles.R | 87 ---------- R/pivot_quantiles.R | 157 ++++++++++++++++++ _pkgdown.yml | 2 +- man/add_frosting.Rd | 4 +- man/arx_fcast_epi_workflow.Rd | 12 +- man/arx_forecaster.Rd | 12 +- man/create_layer.Rd | 6 +- man/dist_quantiles.Rd | 2 +- man/extrapolate_quantiles.Rd | 8 +- man/fit-epi_workflow.Rd | 2 +- man/flatline.Rd | 6 +- man/frosting.Rd | 4 +- man/layer_add_forecast_date.Rd | 12 +- man/layer_add_target_date.Rd | 6 +- man/layer_population_scaling.Rd | 29 ++-- man/layer_predict.Rd | 6 +- man/nested_quantiles.Rd | 6 +- man/pivot_quantiles_longer.Rd | 42 +++++ ..._quantiles.Rd => pivot_quantiles_wider.Rd} | 16 +- man/smooth_quantile_reg.Rd | 27 +-- man/step_epi_shift.Rd | 2 +- man/step_growth_rate.Rd | 4 +- man/step_lag_difference.Rd | 4 +- man/step_population_scaling.Rd | 26 +-- man/step_training_window.Rd | 9 +- tests/testthat/test-pivot_quantiles.R | 58 ++++++- 28 files changed, 375 insertions(+), 179 deletions(-) create mode 100644 R/pivot_quantiles.R create mode 100644 man/pivot_quantiles_longer.Rd rename man/{pivot_quantiles.Rd => pivot_quantiles_wider.Rd} (75%) diff --git a/NAMESPACE b/NAMESPACE index b22ec53a5..c97dc9018 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -143,7 +143,8 @@ export(layer_unnest) export(nested_quantiles) export(new_default_epi_recipe_blueprint) export(new_epi_recipe_blueprint) -export(pivot_quantiles) +export(pivot_quantiles_longer) +export(pivot_quantiles_wider) export(prep) export(quantile_reg) export(remove_frosting) diff --git a/NEWS.md b/NEWS.md index fa99c8bcd..12442639b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,7 @@ * canned forecasters get a class * fixed quantile bug in `flatline_forecaster()` * add functionality to output the unfit workflow from the canned forecasters -* add `pivot_quantiles()` for easier plotting +* add `pivot_quantiles_wider()` for easier plotting # epipredict 0.0.4 diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R index 032a4d96c..ff14d6733 100644 --- a/R/dist_quantiles.R +++ b/R/dist_quantiles.R @@ -116,93 +116,6 @@ is_dist_quantiles <- function(x) { } -#' Turn a vector of quantile distributions into a list-col -#' -#' @param x a `distribution` containing `dist_quantiles` -#' -#' @return a list-col -#' @export -#' -#' @examples -#' edf <- case_death_rate_subset[1:3, ] -#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) -#' -#' edf_nested <- edf %>% dplyr::mutate(q = nested_quantiles(q)) -#' edf_nested %>% tidyr::unnest(q) -nested_quantiles <- function(x) { - stopifnot(is_dist_quantiles(x)) - distributional:::dist_apply(x, .f = function(z) { - tibble::as_tibble(vec_data(z)) %>% - dplyr::mutate(dplyr::across(tidyselect::everything(), as.double)) %>% - list_of() - }) -} - - -#' Pivot columns containing `dist_quantile` wider -#' -#' Any selected columns that contain `dist_quantiles` will be "widened" with -#' the "taus" (quantile) serving as names and the values in the data frame. -#' When pivoting multiple columns, the original column name will be used as -#' a prefix. -#' -#' @param .data A data frame, or a data frame extension such as a tibble or -#' epi_df. -#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted -#' expressions separated by commas. Variable names can be used as if they -#' were positions in the data frame, so expressions like `x:y` can -#' be used to select a range of variables. Any selected columns should -#' -#' @return An object of the same class as `.data` -#' @export -#' -#' @examples -#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) -#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -#' -#' pivot_quantiles(tib, c("d1", "d2")) -#' pivot_quantiles(tib, tidyselect::starts_with("d")) -#' pivot_quantiles(tib, d2) -pivot_quantiles <- function(.data, ...) { - expr <- rlang::expr(c(...)) - cols <- names(tidyselect::eval_select(expr, .data)) - dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) - if (!all(dqs)) { - nms <- cols[!dqs] - cli::cli_abort( - "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." - ) - } - .data <- .data %>% - dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) - checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) - if (!all(checks)) { - nms <- cols[!checks] - cli::cli_abort( - c("Quantiles must be the same length and have the same set of taus.", - i = "Check failed for variables(s) {.var {nms}}." - ) - ) - } - if (length(cols) > 1L) { - for (col in cols) { - .data <- .data %>% - tidyr::unnest(tidyselect::all_of(col)) %>% - tidyr::pivot_wider( - names_from = "tau", values_from = "q", - names_prefix = paste0(col, "_") - ) - } - } else { - .data <- .data %>% - tidyr::unnest(tidyselect::all_of(cols)) %>% - tidyr::pivot_wider(names_from = "tau", values_from = "q") - } - .data -} - - #' @export diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R new file mode 100644 index 000000000..94bfde521 --- /dev/null +++ b/R/pivot_quantiles.R @@ -0,0 +1,157 @@ +#' Turn a vector of quantile distributions into a list-col +#' +#' @param x a `distribution` containing `dist_quantiles` +#' +#' @return a list-col +#' @export +#' +#' @examples +#' edf <- case_death_rate_subset[1:3, ] +#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) +#' +#' edf_nested <- edf %>% dplyr::mutate(q = nested_quantiles(q)) +#' edf_nested %>% tidyr::unnest(q) +nested_quantiles <- function(x) { + stopifnot(is_dist_quantiles(x)) + distributional:::dist_apply(x, .f = function(z) { + tibble::as_tibble(vec_data(z)) %>% + dplyr::mutate(dplyr::across(tidyselect::everything(), as.double)) %>% + list_of() + }) +} + + +#' Pivot columns containing `dist_quantile` longer +#' +#' Selected columns that contains `dist_quantiles` will be "lengthened" with +#' the "taus" (quantile) serving as 1 column and the values as another. If +#' multiple columns are selected, these will be prefixed the the column name. +#' +#' @param .data A data frame, or a data frame extension such as a tibble or +#' epi_df. +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted +#' expressions separated by commas. Variable names can be used as if they +#' were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. +#' @param .ignore_length_check If multiple columns are selected, as long as +#' each row has contains the same number of quantiles, the result will be +#' reasonable. But if, for example, `var1[1]` has 5 quantiles while `var2[1]` +#' has 7, then the only option would be to recycle everything, creating a +#' _very_ long result. By default, this would throw an error. But if this is +#' really the goal, then the error can be bypassed by setting this argument +#' to `TRUE`. The first selected column will vary fastest. +#' +#' @return An object of the same class as `.data`. +#' @export +#' +#' @examples +#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) +#' +#' pivot_quantiles_longer(tib, "d1") +#' pivot_quantiles_longer(tib, tidyselect::ends_with("1")) +#' pivot_quantiles_longer(tib, d1, d2) +pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) { + cols <- validate_pivot_quantiles(.data, ...) + .data <- .data %>% + dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) + if (length(cols) > 1L) { + lengths_check <- .data %>% + dplyr::transmute(dplyr::across( + tidyselect::all_of(cols), + ~ map_int(.x, vctrs::vec_size) + )) %>% + as.matrix() %>% + apply(1, function(x) dplyr::n_distinct(x) == 1L) %>% + all() + if (lengths_check) { + .data <- tidyr::unnest(.data, tidyselect::all_of(cols), names_sep = "_") + } else { + if (.ignore_length_check) { + for (col in cols) { + .data <- .data %>% + tidyr::unnest(tidyselect::all_of(col), names_sep = "_") + } + } else { + cli::cli_abort(c( + "Some selected columns contain different numbers of quantiles.", + "The result would be a {.emph very} long {.cls tibble}.", + "To do this anyway, rerun with `.ignore_length_check = TRUE`." + )) + } + } + } else { + .data <- .data %>% tidyr::unnest(tidyselect::all_of(cols)) + } + .data +} + +#' Pivot columns containing `dist_quantile` wider +#' +#' Any selected columns that contain `dist_quantiles` will be "widened" with +#' the "taus" (quantile) serving as names and the values in the data frame. +#' When pivoting multiple columns, the original column name will be used as +#' a prefix. +#' +#' @param .data A data frame, or a data frame extension such as a tibble or +#' epi_df. +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted +#' expressions separated by commas. Variable names can be used as if they +#' were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. +#' +#' @return An object of the same class as `.data` +#' @export +#' +#' @examples +#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) +#' +#' pivot_quantiles_wider(tib, c("d1", "d2")) +#' pivot_quantiles_wider(tib, tidyselect::starts_with("d")) +#' pivot_quantiles_wider(tib, d2) +pivot_quantiles_wider <- function(.data, ...) { + cols <- validate_pivot_quantiles(.data, ...) + .data <- .data %>% + dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) + checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) + if (!all(checks)) { + nms <- cols[!checks] + cli::cli_abort( + c("Quantiles must be the same length and have the same set of taus.", + i = "Check failed for variables(s) {.var {nms}}." + ) + ) + } + if (length(cols) > 1L) { + for (col in cols) { + .data <- .data %>% + tidyr::unnest(tidyselect::all_of(col)) %>% + tidyr::pivot_wider( + names_from = "tau", values_from = "q", + names_prefix = paste0(col, "_") + ) + } + } else { + .data <- .data %>% + tidyr::unnest(tidyselect::all_of(cols)) %>% + tidyr::pivot_wider(names_from = "tau", values_from = "q") + } + .data +} + + +validate_pivot_quantiles <- function(.data, ...) { + expr <- rlang::expr(c(...)) + cols <- names(tidyselect::eval_select(expr, .data)) + dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) + if (!all(dqs)) { + nms <- cols[!dqs] + cli::cli_abort( + "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." + ) + } + cols +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 2ad03c277..cfaf9cb41 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -67,7 +67,7 @@ reference: - dist_quantiles - extrapolate_quantiles - nested_quantiles - - pivot_quantiles + - starts_with("pivot_quantiles") - title: Included datasets contents: - case_death_rate_subset diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index d7d217777..4d77572a1 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -35,7 +35,9 @@ latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) # Add frosting to a workflow and predict -f <- frosting() \%>\% layer_predict() \%>\% layer_naomit(.pred) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) p1 <- predict(wf1, latest) p1 diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd index fdd309959..7a6b66305 100644 --- a/man/arx_fcast_epi_workflow.Rd +++ b/man/arx_fcast_epi_workflow.Rd @@ -41,12 +41,16 @@ use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate")) +arx_fcast_epi_workflow( + jhu, "death_rate", + c("case_rate", "death_rate") +) arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_forecaster]{arx_forecaster()}} diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index d4866aa0e..e121f272c 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -41,12 +41,16 @@ that it estimates a model for a particular target horizon. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate")) +out <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") +) out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_fcast_epi_workflow]{arx_fcast_epi_workflow()}}, \code{\link[=arx_args_list]{arx_args_list()}} diff --git a/man/create_layer.Rd b/man/create_layer.Rd index 399d62efa..d36385fb2 100644 --- a/man/create_layer.Rd +++ b/man/create_layer.Rd @@ -20,9 +20,9 @@ fill in the name of the layer, and open the file. \examples{ \dontrun{ - # Note: running this will write `layer_strawberry.R` to - # the `R/` directory of your current project - create_layer("strawberry") +# Note: running this will write `layer_strawberry.R` to +# the `R/` directory of your current project +create_layer("strawberry") } } diff --git a/man/dist_quantiles.Rd b/man/dist_quantiles.Rd index 50f00dc32..739bae5a8 100644 --- a/man/dist_quantiles.Rd +++ b/man/dist_quantiles.Rd @@ -15,7 +15,7 @@ dist_quantiles(x, tau) A distribution parameterized by a set of quantiles } \examples{ -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) quantile(dstn, p = c(.1, .25, .5, .9)) median(dstn) diff --git a/man/extrapolate_quantiles.Rd b/man/extrapolate_quantiles.Rd index 985d7cae8..cc6cb2c3c 100644 --- a/man/extrapolate_quantiles.Rd +++ b/man/extrapolate_quantiles.Rd @@ -24,12 +24,14 @@ library(distributional) dstn <- dist_normal(c(10, 2), c(5, 10)) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) # because this distribution is already quantiles, any extra quantiles are # appended extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- c(dist_normal(c(10, 2), c(5, 10)), - dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8)))) +dstn <- c( + dist_normal(c(10, 2), c(5, 10)), + dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) +) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) } diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index fb1c3af28..3dfa0029a 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -29,7 +29,7 @@ preprocessing the data and fitting the underlying parsnip model. } \examples{ jhu <- case_death_rate_subset \%>\% -filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) + filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/flatline.Rd b/man/flatline.Rd index a396cfeb9..c353ff163 100644 --- a/man/flatline.Rd +++ b/man/flatline.Rd @@ -38,8 +38,10 @@ This is an internal function that is used to create a \code{\link[parsnip:linear model. It has somewhat odd behaviour (see below). } \examples{ -tib <- data.frame(y = runif(100), - expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5)) \%>\% +tib <- data.frame( + y = runif(100), + expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5) +) \%>\% dplyr::group_by(k, j) \%>\% dplyr::mutate(y2 = dplyr::lead(y, 2)) # predict 2 steps ahead flat <- flatline(y2 ~ j + k + y, tib) # predictions for 20 locations diff --git a/man/frosting.Rd b/man/frosting.Rd index 83a8d6a9d..362c40a4f 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -24,8 +24,8 @@ The arguments are currently placeholders and must be NULL \examples{ # Toy example to show that frosting can be created and added for postprocessing - f <- frosting() - wf <- epi_workflow() \%>\% add_frosting(f) +f <- frosting() +wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example jhu <- case_death_rate_subset \%>\% diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index 421978eb5..4e173d662 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -46,15 +46,17 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) - # Don't specify `forecast_date` (by default, this should be last date in latest) -f <- frosting() \%>\% layer_predict() \%>\% - layer_naomit(.pred) +# Don't specify `forecast_date` (by default, this should be last date in latest) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf0 <- wf \%>\% add_frosting(f) p0 <- predict(wf0, latest) p0 # Specify a `forecast_date` that is greater than or equal to `as_of` date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) @@ -73,7 +75,7 @@ p2 <- predict(wf2, latest) p2 # Do not specify a forecast_date - f3 <- frosting() \%>\% +f3 <- frosting() \%>\% layer_predict() \%>\% layer_add_forecast_date() \%>\% layer_naomit(.pred) diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index 58ff7770f..3c2884e10 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -48,7 +48,8 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- get_test_data(r, jhu) # Use ahead + forecast date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) @@ -59,7 +60,8 @@ p # Use ahead + max time value from pre, fit, post # which is the same if include `layer_add_forecast_date()` -f2 <- frosting() \%>\% layer_predict() \%>\% +f2 <- frosting() \%>\% + layer_predict() \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) wf2 <- wf \%>\% add_frosting(f2) diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index e841e9a50..179d6862c 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -78,13 +78,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -93,9 +95,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -104,9 +108,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% + dplyr::select(geo_value, time_value, cases) +) predict(wf, latest) } diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 1326dfe75..03473053f 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -62,9 +62,9 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% - step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_naomit() + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index 1a2824041..143532650 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R +% Please edit documentation in R/pivot_quantiles.R \name{nested_quantiles} \alias{nested_quantiles} \title{Turn a vector of quantile distributions into a list-col} @@ -16,8 +16,8 @@ a list-col Turn a vector of quantile distributions into a list-col } \examples{ -edf <- case_death_rate_subset[1:3,] -edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5/6, 2:4/5, 3:10/11)) +edf <- case_death_rate_subset[1:3, ] +edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) edf_nested <- edf \%>\% dplyr::mutate(q = nested_quantiles(q)) edf_nested \%>\% tidyr::unnest(q) diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd new file mode 100644 index 000000000..f29f27cd2 --- /dev/null +++ b/man/pivot_quantiles_longer.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles_longer} +\alias{pivot_quantiles_longer} +\title{Pivot columns containing \code{dist_quantile} longer} +\usage{ +pivot_quantiles_longer(.data, ..., .ignore_length_check = FALSE) +} +\arguments{ +\item{.data}{A data frame, or a data frame extension such as a tibble or +epi_df.} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted +expressions separated by commas. Variable names can be used as if they +were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables.} + +\item{.ignore_length_check}{If multiple columns are selected, as long as +each row has contains the same number of quantiles, the result will be +reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2[1]} +has 7, then the only option would be to recycle everything, creating a +\emph{very} long result. By default, this would throw an error. But if this is +really the goal, then the error can be bypassed by setting this argument +to \code{TRUE}.} +} +\value{ +An object of the same class as \code{.data}. +} +\description{ +Selected columns that contains \code{dist_quantiles} will be "lengthened" with +the "taus" (quantile) serving as 1 column and the values as another. If +multiple columns are selected, these will be prefixed the the column name. +} +\examples{ +d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + +pivot_quantiles_longer(tib, "d1") +pivot_quantiles_longer(tib, tidyselect::ends_with("1")) +pivot_quantiles_longer(tib, d1, d2) +} diff --git a/man/pivot_quantiles.Rd b/man/pivot_quantiles_wider.Rd similarity index 75% rename from man/pivot_quantiles.Rd rename to man/pivot_quantiles_wider.Rd index 0ed6588ed..02a33bb2f 100644 --- a/man/pivot_quantiles.Rd +++ b/man/pivot_quantiles_wider.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R -\name{pivot_quantiles} -\alias{pivot_quantiles} +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles_wider} +\alias{pivot_quantiles_wider} \title{Pivot columns containing \code{dist_quantile} wider} \usage{ -pivot_quantiles(.data, ...) +pivot_quantiles_wider(.data, ...) } \arguments{ \item{.data}{A data frame, or a data frame extension such as a tibble or @@ -13,7 +13,7 @@ epi_df.} \item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables. Any selected columns should} +be used to select a range of variables.} } \value{ An object of the same class as \code{.data} @@ -29,7 +29,7 @@ d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -pivot_quantiles(tib, c("d1", "d2")) -pivot_quantiles(tib, tidyselect::starts_with("d")) -pivot_quantiles(tib, d2) +pivot_quantiles_wider(tib, c("d1", "d2")) +pivot_quantiles_wider(tib, tidyselect::starts_with("d")) +pivot_quantiles_wider(tib, d2) } diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 293999876..6cc2dfc82 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -39,9 +39,10 @@ only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}} tib <- data.frame( y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), - x1 = rnorm(100), x2 = rnorm(100)) + x1 = rnorm(100), x2 = rnorm(100) +) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) -ff <- qr_spec \%>\% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +ff <- qr_spec \%>\% fit(cbind(y1, y2, y3, y4, y5, y6) ~ ., data = tib) p <- predict(ff, new_data = tib) x <- -99:99 / 100 * 2 * pi @@ -50,21 +51,23 @@ fd <- x[length(x) - 20] XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) XY <- tibble::as_tibble(XY) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) -tt <- qr_spec \%>\% fit_xy(x = XY[,21:41], y = XY[,1:20]) +tt <- qr_spec \%>\% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) library(tidyr) library(dplyr) pl <- predict( - object = tt, - new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] - ) + object = tt, + new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] +) pl <- pl \%>\% - unnest(.pred) \%>\% - mutate(distn = nested_quantiles(distn)) \%>\% - unnest(distn) \%>\% - mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, - ahead = NULL) \%>\% - pivot_wider(names_from = tau, values_from = q) + unnest(.pred) \%>\% + mutate(distn = nested_quantiles(distn)) \%>\% + unnest(distn) \%>\% + mutate( + x = x[length(x) - 20] + ahead / 100 * 2 * pi, + ahead = NULL + ) \%>\% + pivot_wider(names_from = tau, values_from = q) plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") curve(sin(x), add = TRUE) abline(v = fd, lty = 2) diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index ca8609b1e..bf135346e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -90,7 +90,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \examples{ r <- epi_recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_lag(death_rate, lag = c(0,7,14)) + step_epi_lag(death_rate, lag = c(0, 7, 14)) r } \seealso{ diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index 0449b887c..b409135b1 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -87,7 +87,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_growth_rate(case_rate, death_rate) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index d69c25faa..b06abe43c 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -59,7 +59,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 2964c6912..1a9564563 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -104,13 +104,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -119,9 +121,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -130,8 +134,10 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% dplyr::select(geo_value, time_value, cases) ) diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 7861f27ea..ce7c0fc74 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -50,9 +50,12 @@ after any filtering step. tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), by = 1, - length.out = 5), times = 2), - geo_value = rep(c("ca", "hi"), each = 5)) \%>\% + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, + length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) +) \%>\% as_epi_df() epi_recipe(y ~ x, data = tib) \%>\% diff --git a/tests/testthat/test-pivot_quantiles.R b/tests/testthat/test-pivot_quantiles.R index 85694aace..cdf84f28d 100644 --- a/tests/testthat/test-pivot_quantiles.R +++ b/tests/testthat/test-pivot_quantiles.R @@ -1,26 +1,68 @@ -test_that("quantile pivotting behaves", { +test_that("quantile pivotting wider behaves", { tib <- tibble::tibble(a = 1:5, b = 6:10) - expect_error(pivot_quantiles(tib, a)) + expect_error(pivot_quantiles_wider(tib, a)) tib$c <- rep(dist_normal(), 5) - expect_error(pivot_quantiles(tib, c)) + expect_error(pivot_quantiles_wider(tib, c)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) # different quantiles tib <- tib[1:2, ] tib$d1 <- d1 - expect_error(pivot_quantiles(tib, d1)) + expect_error(pivot_quantiles_wider(tib, d1)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) tib$d1 <- d1 # would want to error (mismatched quantiles), but hard to check efficiently - expect_silent(pivot_quantiles(tib, d1)) + expect_silent(pivot_quantiles_wider(tib, d1)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) - expect_length(pivot_quantiles(tib, c("d1", "d2")), 7L) - expect_length(pivot_quantiles(tib, tidyselect::starts_with("d")), 7L) - expect_length(pivot_quantiles(tib, d2), 5L) + expect_length(pivot_quantiles_wider(tib, c("d1", "d2")), 7L) + expect_length(pivot_quantiles_wider(tib, tidyselect::starts_with("d")), 7L) + expect_length(pivot_quantiles_wider(tib, d2), 5L) +}) + + +test_that("quantile pivotting longer behaves", { + tib <- tibble::tibble(a = 1:5, b = 6:10) + expect_error(pivot_quantiles_longer(tib, a)) + tib$c <- rep(dist_normal(), 5) + expect_error(pivot_quantiles_longer(tib, c)) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) + # different quantiles + tib <- tib[1:2, ] + tib$d1 <- d1 + expect_length(pivot_quantiles_longer(tib, d1), 5L) + expect_identical(nrow(pivot_quantiles_longer(tib, d1)), 7L) + expect_identical(pivot_quantiles_longer(tib, d1)$q, as.double(c(1:3, 2:5))) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) + tib$d1 <- d1 + expect_silent(pivot_quantiles_longer(tib, d1)) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) + d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) + tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + + + expect_length(pivot_quantiles_longer(tib, c("d1", "d2")), 5L) + expect_identical(nrow(pivot_quantiles_longer(tib, c("d1", "d2"))), 6L) + expect_silent(pivot_quantiles_longer(tib, tidyselect::starts_with("d"))) + expect_length(pivot_quantiles_longer(tib, d2), 5L) + + tib$d3 <- c(dist_quantiles(2:5, 2:5 / 6), dist_quantiles(3:6, 2:5 / 6)) + # now the cols have different numbers of quantiles + expect_error(pivot_quantiles_longer(tib, d1, d3)) + expect_length( + pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE), + 6L + ) + expect_identical( + pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE)$d1_q, + as.double(rep(c(1:3, 2:4), each = 4)) + ) }) From f97166b0d3d2ed9f658f4b0e5db8f0be00c0b5ea Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 12:46:50 -0700 Subject: [PATCH 11/22] fix extra check note. --- .Rbuildignore | 1 + man/pivot_quantiles_longer.Rd | 2 +- tests/testthat/test-pivot_quantiles.R | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 5139bcabe..cb36bb9d2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ ^musings$ ^data-raw$ ^vignettes/articles$ +^.git-blame-ignore-revs$ diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd index f29f27cd2..1cb6f5165 100644 --- a/man/pivot_quantiles_longer.Rd +++ b/man/pivot_quantiles_longer.Rd @@ -21,7 +21,7 @@ reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2 has 7, then the only option would be to recycle everything, creating a \emph{very} long result. By default, this would throw an error. But if this is really the goal, then the error can be bypassed by setting this argument -to \code{TRUE}.} +to \code{TRUE}. The first selected column will vary fastest.} } \value{ An object of the same class as \code{.data}. diff --git a/tests/testthat/test-pivot_quantiles.R b/tests/testthat/test-pivot_quantiles.R index cdf84f28d..9928c5e09 100644 --- a/tests/testthat/test-pivot_quantiles.R +++ b/tests/testthat/test-pivot_quantiles.R @@ -52,7 +52,7 @@ test_that("quantile pivotting longer behaves", { expect_length(pivot_quantiles_longer(tib, c("d1", "d2")), 5L) expect_identical(nrow(pivot_quantiles_longer(tib, c("d1", "d2"))), 6L) expect_silent(pivot_quantiles_longer(tib, tidyselect::starts_with("d"))) - expect_length(pivot_quantiles_longer(tib, d2), 5L) + expect_length(pivot_quantiles_longer(tib, d2), 4L) tib$d3 <- c(dist_quantiles(2:5, 2:5 / 6), dist_quantiles(3:6, 2:5 / 6)) # now the cols have different numbers of quantiles From 9dd0a2c19a8772dcc5c4104f85429550d3f8aff5 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 12:47:36 -0700 Subject: [PATCH 12/22] run styler --- R/step_growth_rate.R | 27 +++++++++++++-------------- R/step_lag_difference.R | 19 +++++++++---------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index f6ad29a5b..74cfff284 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -42,20 +42,19 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_growth_rate <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), - log_scale = FALSE, - replace_Inf = NA, - prefix = "gr_", - columns = NULL, - skip = FALSE, - id = rand_id("growth_rate"), - additional_gr_args_list = list()) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), + log_scale = FALSE, + replace_Inf = NA, + prefix = "gr_", + columns = NULL, + skip = FALSE, + id = rand_id("growth_rate"), + additional_gr_args_list = list()) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 2482be46a..21878eaa7 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -23,16 +23,15 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_lag_difference <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - prefix = "lag_diff_", - columns = NULL, - skip = FALSE, - id = rand_id("lag_diff")) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + prefix = "lag_diff_", + columns = NULL, + skip = FALSE, + id = rand_id("lag_diff")) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } From 16139fffa1335efe250f350d5d6b6b020061cc64 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 29 Sep 2023 16:21:37 -0700 Subject: [PATCH 13/22] add lifecycle, deprecate pivot_quantiles. --- DESCRIPTION | 1 + NAMESPACE | 1 + R/epipredict-package.R | 3 +++ R/pivot_quantiles.R | 3 +++ man/figures/lifecycle-archived.svg | 21 ++++++++++++++++ man/figures/lifecycle-defunct.svg | 21 ++++++++++++++++ man/figures/lifecycle-deprecated.svg | 21 ++++++++++++++++ man/figures/lifecycle-experimental.svg | 21 ++++++++++++++++ man/figures/lifecycle-maturing.svg | 21 ++++++++++++++++ man/figures/lifecycle-questioning.svg | 21 ++++++++++++++++ man/figures/lifecycle-soft-deprecated.svg | 21 ++++++++++++++++ man/figures/lifecycle-stable.svg | 29 +++++++++++++++++++++++ man/figures/lifecycle-superseded.svg | 21 ++++++++++++++++ 13 files changed, 205 insertions(+) create mode 100644 man/figures/lifecycle-archived.svg create mode 100644 man/figures/lifecycle-defunct.svg create mode 100644 man/figures/lifecycle-deprecated.svg create mode 100644 man/figures/lifecycle-experimental.svg create mode 100644 man/figures/lifecycle-maturing.svg create mode 100644 man/figures/lifecycle-questioning.svg create mode 100644 man/figures/lifecycle-soft-deprecated.svg create mode 100644 man/figures/lifecycle-stable.svg create mode 100644 man/figures/lifecycle-superseded.svg diff --git a/DESCRIPTION b/DESCRIPTION index 75602f072..eb6405df4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Imports: generics, glue, hardhat (>= 1.3.0), + lifecycle, magrittr, methods, quantreg, diff --git a/NAMESPACE b/NAMESPACE index c97dc9018..c18b1858c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -168,6 +168,7 @@ importFrom(generics,augment) importFrom(generics,fit) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) +importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") importFrom(methods,is) importFrom(quantreg,rq) diff --git a/R/epipredict-package.R b/R/epipredict-package.R index 51478065b..da4991feb 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -1,5 +1,8 @@ +## usethis namespace: start #' @importFrom tibble tibble #' @importFrom rlang := !! #' @importFrom stats poly predict lm residuals quantile +#' @importFrom lifecycle deprecated #' @import epiprocess parsnip +## usethis namespace: end NULL diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index 94bfde521..de4aa1e01 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -142,6 +142,9 @@ pivot_quantiles_wider <- function(.data, ...) { .data } +pivot_quantiles <- function(.data, ...) { + lifecycle::deprecate_stop("0.0.6", "pivot_quantiles()", "pivot_quantiles_wider()") +} validate_pivot_quantiles <- function(.data, ...) { expr <- rlang::expr(c(...)) diff --git a/man/figures/lifecycle-archived.svg b/man/figures/lifecycle-archived.svg new file mode 100644 index 000000000..745ab0c78 --- /dev/null +++ b/man/figures/lifecycle-archived.svg @@ -0,0 +1,21 @@ + + lifecycle: archived + + + + + + + + + + + + + + + lifecycle + + archived + + diff --git a/man/figures/lifecycle-defunct.svg b/man/figures/lifecycle-defunct.svg new file mode 100644 index 000000000..d5c9559ed --- /dev/null +++ b/man/figures/lifecycle-defunct.svg @@ -0,0 +1,21 @@ + + lifecycle: defunct + + + + + + + + + + + + + + + lifecycle + + defunct + + diff --git a/man/figures/lifecycle-deprecated.svg b/man/figures/lifecycle-deprecated.svg new file mode 100644 index 000000000..b61c57c3f --- /dev/null +++ b/man/figures/lifecycle-deprecated.svg @@ -0,0 +1,21 @@ + + lifecycle: deprecated + + + + + + + + + + + + + + + lifecycle + + deprecated + + diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg new file mode 100644 index 000000000..5d88fc2c6 --- /dev/null +++ b/man/figures/lifecycle-experimental.svg @@ -0,0 +1,21 @@ + + lifecycle: experimental + + + + + + + + + + + + + + + lifecycle + + experimental + + diff --git a/man/figures/lifecycle-maturing.svg b/man/figures/lifecycle-maturing.svg new file mode 100644 index 000000000..897370ecf --- /dev/null +++ b/man/figures/lifecycle-maturing.svg @@ -0,0 +1,21 @@ + + lifecycle: maturing + + + + + + + + + + + + + + + lifecycle + + maturing + + diff --git a/man/figures/lifecycle-questioning.svg b/man/figures/lifecycle-questioning.svg new file mode 100644 index 000000000..7c1721d05 --- /dev/null +++ b/man/figures/lifecycle-questioning.svg @@ -0,0 +1,21 @@ + + lifecycle: questioning + + + + + + + + + + + + + + + lifecycle + + questioning + + diff --git a/man/figures/lifecycle-soft-deprecated.svg b/man/figures/lifecycle-soft-deprecated.svg new file mode 100644 index 000000000..9c166ff30 --- /dev/null +++ b/man/figures/lifecycle-soft-deprecated.svg @@ -0,0 +1,21 @@ + + lifecycle: soft-deprecated + + + + + + + + + + + + + + + lifecycle + + soft-deprecated + + diff --git a/man/figures/lifecycle-stable.svg b/man/figures/lifecycle-stable.svg new file mode 100644 index 000000000..9bf21e76b --- /dev/null +++ b/man/figures/lifecycle-stable.svg @@ -0,0 +1,29 @@ + + lifecycle: stable + + + + + + + + + + + + + + + + lifecycle + + + + stable + + + diff --git a/man/figures/lifecycle-superseded.svg b/man/figures/lifecycle-superseded.svg new file mode 100644 index 000000000..db8d757f7 --- /dev/null +++ b/man/figures/lifecycle-superseded.svg @@ -0,0 +1,21 @@ + + lifecycle: superseded + + + + + + + + + + + + + + + lifecycle + + superseded + + From c9b4667ccecf9d36de71383ef94fda0fa84ba996 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 4 Oct 2023 12:30:41 -0700 Subject: [PATCH 14/22] working cdc baseline --- NAMESPACE | 3 + R/cdc_baseline_forecaster.R | 228 ++++++++++++++++++++++++++++ R/layer_cdc_flatline_quantiles.R | 17 ++- man/cdc_baseline_args_list.Rd | 85 +++++++++++ man/cdc_baseline_forecaster.Rd | 73 +++++++++ man/layer_cdc_flatline_quantiles.Rd | 14 +- man/smooth_quantile_reg.Rd | 3 +- tests/testthat/test-parse_period.R | 12 ++ 8 files changed, 419 insertions(+), 16 deletions(-) create mode 100644 R/cdc_baseline_forecaster.R create mode 100644 man/cdc_baseline_args_list.Rd create mode 100644 man/cdc_baseline_forecaster.Rd create mode 100644 tests/testthat/test-parse_period.R diff --git a/NAMESPACE b/NAMESPACE index e361dec00..d7e941b0f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,6 +52,7 @@ S3method(print,alist) S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) +S3method(print,cdc_baseline_fcast) S3method(print,epi_workflow) S3method(print,flat_fcast) S3method(print,flatline) @@ -107,6 +108,8 @@ export(arx_classifier) export(arx_fcast_epi_workflow) export(arx_forecaster) export(bake) +export(cdc_baseline_args_list) +export(cdc_baseline_forecaster) export(create_layer) export(default_epi_recipe_blueprint) export(detect_layer) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R new file mode 100644 index 000000000..f5961431d --- /dev/null +++ b/R/cdc_baseline_forecaster.R @@ -0,0 +1,228 @@ +#' Predict the future with the most recent value +#' +#' This is a simple forecasting model for +#' [epiprocess::epi_df] data. It uses the most recent observation as the +#' forecast for any future date, and produces intervals by shuffling the quantiles +#' of the residuals of such a "flatline" forecast and incrementing these +#' forward over all available training data. +#' +#' By default, the predictive intervals are computed separately for each +#' combination of `geo_value` in the `epi_data` argument. +#' +#' This forecaster is meant to produce exactly the CDC Baseline used for +#' [COVID19ForecastHub](https://covid19forecasthub.org) +#' +#' @param epi_data An [epiprocess::epi_df] +#' @param outcome A scalar character for the column name we wish to predict. +#' @param args_list A list of additional arguments as created by the +#' [cdc_baseline_args_list()] constructor function. +#' +#' @return A data frame of point and interval forecasts at for all +#' aheads (unique horizons) for each unique combination of `key_vars`. +#' @export +#' +#' @examples +#' library(dplyr) +#' weekly_deaths <- case_death_rate_subset %>% +#' select(geo_value, time_value, death_rate) %>% +#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) %>% +#' select(-pop, -death_rate) %>% +#' group_by(geo_value) %>% +#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% +#' ungroup() %>% +#' filter(weekdays(time_value) == "Saturday") +#' +#' cdc <- cdc_baseline_forecaster(deaths, "deaths") +#' preds <- pivot_quantiles(cdc$predictions, .pred_distn) +#' +#' if (require(ggplot2)) { +#' forecast_date <- unique(preds$forecast_date) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = deaths %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = deaths) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Weekly deaths") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' } +cdc_baseline_forecaster <- function( + epi_data, + outcome, + args_list = cdc_baseline_args_list()) { + validate_forecaster_inputs(epi_data, outcome, "time_value") + if (!inherits(args_list, c("cdc_flat_fcast", "alist"))) { + cli_stop("args_list was not created using `cdc_baseline_args_list().") + } + keys <- epi_keys(epi_data) + ek <- kill_time_value(keys) + outcome <- rlang::sym(outcome) + + + r <- epi_recipe(epi_data) %>% + step_epi_ahead(!!outcome, ahead = args_list$data_frequency, skip = TRUE) %>% + recipes::update_role(!!outcome, new_role = "predictor") %>% + recipes::add_role(tidyselect::all_of(keys), new_role = "predictor") %>% + step_training_window(n_recent = args_list$n_training) + + forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) + # target_date <- args_list$target_date %||% forecast_date + args_list$ahead + + + latest <- get_test_data( + epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, + forecast_date + ) + + f <- frosting() %>% + layer_predict() %>% + layer_cdc_flatline_quantiles( + aheads = args_list$aheads, + quantile_levels = args_list$quantile_levels, + nsims = args_list$nsims, + by_key = args_list$quantile_by_key, + symmetrize = args_list$symmetrize, + nonneg = args_list$nonneg + ) %>% + layer_add_forecast_date(forecast_date = forecast_date) %>% + layer_unnest(.pred_distn_all) + # layer_add_target_date(target_date = target_date) + if (args_list$nonneg) f <- layer_threshold(f, ".pred") + + eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") + + wf <- epi_workflow(r, eng, f) + wf <- generics::fit(wf, epi_data) + preds <- suppressWarnings(predict(wf, new_data = latest)) %>% + tibble::as_tibble() %>% + dplyr::select(-time_value) %>% + dplyr::mutate(target_date = forecast_date + ahead * args_list$data_frequency) + + structure( + list( + predictions = preds, + epi_workflow = wf, + metadata = list( + training = attr(epi_data, "metadata"), + forecast_created = Sys.time() + ) + ), + class = c("cdc_baseline_fcast", "canned_epipred") + ) +} + + + +#' CDC baseline forecaster argument constructor +#' +#' Constructs a list of arguments for [cdc_baseline_forecaster()]. +#' +#' @inheritParams arx_args_list +#' @param data_frequency Integer or string. This describes the frequency of the +#' input `epi_df`. For typical FluSight forecasts, this would be `"1 week"`. +#' Allowable arguments are integers (taken to mean numbers of days) or a +#' string like `"7 days"` or `"2 weeks"`. Currently, all other periods +#' (other than days or weeks) result in an error. +#' @param aheads Integer vector. Unlike [arx_forecaster()], this doesn't have +#' any effect on the predicted values. +#' Predictions are always the most recent observation. This determines the +#' set of prediction horizons for [layer_cdc_flatline_quantiles()]`. It interacts +#' with the `data_frequency` argument. So, for example, if the data is daily +#' and you want forecasts for 1:4 days ahead, then you would use `1:4`. However, +#' if you want one-week predictions, you would set this as `c(7, 14, 21, 28)`. +#' But if `data_frequency` is `"1 week"`, then you would set it as `1:4`. +#' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce +#' prediction intervals. These are created by computing the quantiles of +#' training residuals. A `NULL` value will result in point forecasts only. +#' @param nsims Positive integer. The number of draws from the empirical CDF. +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +#' in linear interpolation on the X scale. This is achieved with +#' [stats::quantile()] Type 7 (the default for that function). +#' @param nonneg Logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this +#' has slightly different behaviour than would occur if using +#' [layer_threshold_preds()]. +#' +#' @return A list containing updated parameter choices with class `cdc_flat_fcast`. +#' @export +#' +#' @examples +#' cdc_baseline_args_list() +#' cdc_baseline_args_list(symmetrize = FALSE) +#' cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +cdc_baseline_args_list <- function( + data_frequency = "1 week", + aheads = 1:4, + n_training = Inf, + forecast_date = NULL, + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e3L, + symmetrize = TRUE, + nonneg = TRUE, + quantile_by_key = "geo_value", + nafill_buffer = Inf) { + arg_is_scalar(n_training, nsims, data_frequency) + data_frequency <- parse_period(data_frequency) + arg_is_pos_int(data_frequency) + arg_is_chr(quantile_by_key, allow_empty = TRUE) + arg_is_scalar(forecast_date, allow_null = TRUE) + arg_is_date(forecast_date, allow_null = TRUE) + arg_is_nonneg_int(aheads, nsims) + arg_is_lgl(symmetrize, nonneg) + arg_is_probabilities(quantile_levels, allow_null = TRUE) + arg_is_pos(n_training) + if (is.finite(n_training)) arg_is_pos_int(n_training) + if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) + + structure( + enlist( + data_frequency, + aheads, + n_training, + forecast_date, + quantile_levels, + nsims, + symmetrize, + nonneg, + quantile_by_key, + nafill_buffer + ), + class = c("cdc_baseline_fcast", "alist") + ) +} + +#' @export +print.cdc_baseline_fcast <- function(x, ...) { + name <- "CDC Baseline" + NextMethod(name = name, ...) +} + +parse_period <- function(x) { + arg_is_scalar(x) + if (is.character(x)) { + x <- unlist(strsplit(x, " ")) + if (length(x) == 1L) x <- as.numeric(x) + if (length(x) == 2L) { + mult <- substr(x[2], 1, 3) + mult <- switch( + mult, + day = 1L, + wee = 7L, + cli::cli_abort("incompatible timespan in `aheads`.") + ) + x <- as.numeric(x[1]) * mult + } + if (length(x) > 2L) cli::cli_abort("incompatible timespan in `aheads`.") + } + stopifnot(rlang::is_integerish(x)) + as.integer(x) +} diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 7ff224359..aa953ad45 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -97,7 +97,7 @@ layer_cdc_flatline_quantiles <- function( frosting, ..., aheads = 1:4, - quantiles = c(.01, .025, 1:19 / 20, .975, .99), + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), nsims = 1e3, by_key = "geo_value", symmetrize = FALSE, @@ -106,7 +106,7 @@ layer_cdc_flatline_quantiles <- function( rlang::check_dots_empty() arg_is_int(aheads) - arg_is_probabilities(quantiles) + arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos_int(nsims) arg_is_scalar(nsims) arg_is_chr_scalar(id) @@ -117,7 +117,7 @@ layer_cdc_flatline_quantiles <- function( frosting, layer_cdc_flatline_quantiles_new( aheads = aheads, - quantiles = quantiles, + quantile_levels = quantile_levels, nsims = nsims, by_key = by_key, symmetrize = symmetrize, @@ -129,7 +129,7 @@ layer_cdc_flatline_quantiles <- function( layer_cdc_flatline_quantiles_new <- function( aheads, - quantiles, + quantile_levels, nsims, by_key, symmetrize, @@ -138,7 +138,7 @@ layer_cdc_flatline_quantiles_new <- function( layer( "cdc_flatline_quantiles", aheads = aheads, - quantiles = quantiles, + quantile_levels = quantile_levels, nsims = nsims, by_key = by_key, symmetrize = symmetrize, @@ -150,6 +150,7 @@ layer_cdc_flatline_quantiles_new <- function( #' @export slather.layer_cdc_flatline_quantiles <- function(object, components, workflow, new_data, ...) { + if (is.null(object$quantile_levels)) return(components) the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { cli::cli_warn( @@ -213,7 +214,7 @@ slather.layer_cdc_flatline_quantiles <- dplyr::rowwise() %>% dplyr::mutate( .pred_distn_all = propogate_samples( - .resid, .pred, object$quantiles, + .resid, .pred, object$quantile_levels, object$aheads, object$nsim, object$symmetrize, object$nonneg ) ) %>% @@ -229,7 +230,7 @@ slather.layer_cdc_flatline_quantiles <- } propogate_samples <- function( - r, p, quantiles, aheads, nsim, symmetrize, nonneg) { + r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) { max_ahead <- max(aheads) samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), na.rm = TRUE) res <- list() @@ -254,7 +255,7 @@ propogate_samples <- function( list(tibble::tibble( ahead = aheads, .pred_distn = map_vec( - res, ~ dist_quantiles(quantile(.x, quantiles), tau = quantiles) + res, ~ dist_quantiles(quantile(.x, quantile_levels), quantile_levels) ) )) } diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd new file mode 100644 index 000000000..37c326e9c --- /dev/null +++ b/man/cdc_baseline_args_list.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cdc_baseline_forecaster.R +\name{cdc_baseline_args_list} +\alias{cdc_baseline_args_list} +\title{CDC baseline forecaster argument constructor} +\usage{ +cdc_baseline_args_list( + data_frequency = "1 week", + aheads = 1:4, + n_training = Inf, + forecast_date = NULL, + quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + nsims = 1000L, + symmetrize = TRUE, + nonneg = TRUE, + quantile_by_key = "geo_value", + nafill_buffer = Inf +) +} +\arguments{ +\item{data_frequency}{Integer or string. This describes the frequency of the +input \code{epi_df}. For typical FluSight forecasts, this would be \code{"1 week"}. +Allowable arguments are integers (taken to mean numbers of days) or a +string like \code{"7 days"} or \code{"2 weeks"}. Currently, all other periods +(other than days or weeks) result in an error.} + +\item{aheads}{Integer vector. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have +any effect on the predicted values. +Predictions are always the most recent observation. This determines the +set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_cdc_flatline_quantiles()}}\verb{. It interacts with the }data_frequency\verb{argument. So, for example, if the data is daily and you want forecasts for 1:4 days ahead, then you would use}1:4\verb{. However, if you want one-week predictions, you would set this as }c(7, 14, 21, 28)\verb{. But if }data_frequency\code{is}"1 week"\verb{, then you would set it as }1:4`.} + +\item{n_training}{Integer. An upper limit for the number of rows per +key that are used for training +(in the time unit of the \code{epi_df}).} + +\item{forecast_date}{Date. The date on which the forecast is created. +The default \code{NULL} will attempt to determine this automatically.} + +\item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce +prediction intervals. These are created by computing the quantiles of +training residuals. A \code{NULL} value will result in point forecasts only.} + +\item{nsims}{Positive integer. The number of draws from the empirical CDF. +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +in linear interpolation on the X scale. This is achieved with +\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} + +\item{symmetrize}{Logical. The default \code{TRUE} calculates +symmetric prediction intervals. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{nonneg}{Logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this +has slightly different behaviour than would occur if using +\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} + +\item{quantile_by_key}{Character vector. Groups residuals by listed keys +before calculating residual quantiles. See the \code{by_key} argument to +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} for more information. The default, +\code{character(0)} performs no grouping. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{nafill_buffer}{At predict time, recent values of the training data +are used to create a forecast. However, these can be \code{NA} due to, e.g., +data latency issues. By default, any missing values will get filled with +less recent data. Setting this value to \code{NULL} will result in 1 extra +recent row (beyond those required for lag creation) to be used. Note that +we require at least \code{min(lags)} rows of recent data per \code{geo_value} to +create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} +will be treated as \emph{additional} allowed recent data rather than the +total amount of recent data to examine.} +} +\value{ +A list containing updated parameter choices with class \code{cdc_flat_fcast}. +} +\description{ +Constructs a list of arguments for \code{\link[=cdc_baseline_forecaster]{cdc_baseline_forecaster()}}. +} +\examples{ +cdc_baseline_args_list() +cdc_baseline_args_list(symmetrize = FALSE) +cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +} diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd new file mode 100644 index 000000000..8d0e1b3ec --- /dev/null +++ b/man/cdc_baseline_forecaster.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cdc_baseline_forecaster.R +\name{cdc_baseline_forecaster} +\alias{cdc_baseline_forecaster} +\title{Predict the future with the most recent value} +\usage{ +cdc_baseline_forecaster( + epi_data, + outcome, + args_list = cdc_baseline_args_list() +) +} +\arguments{ +\item{epi_data}{An \link[epiprocess:epi_df]{epiprocess::epi_df}} + +\item{outcome}{A scalar character for the column name we wish to predict.} + +\item{args_list}{A list of additional arguments as created by the +\code{\link[=cdc_baseline_args_list]{cdc_baseline_args_list()}} constructor function.} +} +\value{ +A data frame of point and interval forecasts at for all +aheads (unique horizons) for each unique combination of \code{key_vars}. +} +\description{ +This is a simple forecasting model for +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the +forecast for any future date, and produces intervals by shuffling the quantiles +of the residuals of such a "flatline" forecast and incrementing these +forward over all available training data. +} +\details{ +By default, the predictive intervals are computed separately for each +combination of \code{geo_value} in the \code{epi_data} argument. + +This forecaster is meant to produce exactly the CDC Baseline used for +\href{https://covid19forecasthub.org}{COVID19ForecastHub} +} +\examples{ +library(dplyr) +weekly_deaths <- case_death_rate_subset \%>\% + select(geo_value, time_value, death_rate) \%>\% + left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop, 0)) \%>\% + select(-pop, -death_rate) \%>\% + group_by(geo_value) \%>\% + epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% + ungroup() \%>\% + filter(weekdays(time_value) == "Saturday") + +cdc <- cdc_baseline_forecaster(deaths, "deaths") +preds <- pivot_quantiles(cdc$predictions, .pred_distn) + +if (require(ggplot2)) { +forecast_date <- unique(preds$forecast_date) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = deaths \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = deaths) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Weekly deaths") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) +} +} diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 4f151e854..8eb42b423 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -8,7 +8,7 @@ layer_cdc_flatline_quantiles( frosting, ..., aheads = 1:4, - quantiles = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99), nsims = 1000, by_key = "geo_value", symmetrize = FALSE, @@ -27,10 +27,6 @@ typically observed daily (possibly with missing values), but with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, you would use \code{1:4}.} -\item{quantiles}{Numeric vector of probabilities with values in (0,1) -referring to the desired predictive intervals. The default is the standard -set for the COVID Forecast Hub.} - \item{nsims}{Positive integer. The number of draws from the empirical CDF. These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in linear interpolation on the X scale. This is achieved with @@ -47,6 +43,10 @@ has slightly different behaviour than would occur if using \code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} \item{id}{a random id string} + +\item{quantiles}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} } \value{ an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result @@ -105,7 +105,7 @@ preds <- preds \%>\% pivot_quantiles(.pred_distn) \%>\% mutate(target_date = forecast_date + ahead) -library(ggplot2) +if (require("ggplot2")) { four_states <- c("ca", "pa", "wa", "ny") preds \%>\% filter(geo_value \%in\% four_states) \%>\% @@ -122,5 +122,5 @@ preds \%>\% facet_wrap(~geo_value, scales = "free_y") + theme_bw() + geom_vline(xintercept = forecast_date) - +} } diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 6cc2dfc82..b938541f1 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -74,7 +74,8 @@ abline(v = fd, lty = 2) lines(pl$x, pl$`0.2`, col = "blue") lines(pl$x, pl$`0.8`, col = "blue") lines(pl$x, pl$`0.5`, col = "red") -\dontrun{ + +if (require("ggplot2")) { ggplot(data.frame(x = x, y = y), aes(x)) + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + geom_point(aes(y = y), colour = "grey") + # observed data diff --git a/tests/testthat/test-parse_period.R b/tests/testthat/test-parse_period.R new file mode 100644 index 000000000..0adbcec3d --- /dev/null +++ b/tests/testthat/test-parse_period.R @@ -0,0 +1,12 @@ +test_that("parse_period works", { + expect_error(parse_period(c(1, 2))) + expect_error(parse_period(c(1.3))) + expect_error(parse_period("1 year")) + expect_error(parse_period("2 weeks later")) + expect_identical(parse_period(1), 1L) + expect_identical(parse_period("1 day"), 1L) + expect_identical(parse_period("1 days"), 1L) + expect_identical(parse_period("1 week"), 7L) + expect_identical(parse_period("1 weeks"), 7L) + expect_identical(parse_period("2 weeks"), 14L) +}) From d59a691d2eea0a7ae7606464120e18e0b8e22bc3 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 4 Oct 2023 12:32:16 -0700 Subject: [PATCH 15/22] add cdc baseline to pkgdown --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 2ad03c277..89fd733e1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -34,6 +34,7 @@ reference: contents: - contains("flatline") - contains("arx") + - contains("cdc") - title: Parsnip engines desc: Prediction methods not available elsewhere contents: From 21b4c85340972e0005af1bc40fd1e0bb4763a9ed Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 4 Oct 2023 13:27:02 -0700 Subject: [PATCH 16/22] local checks pass --- .Rbuildignore | 1 + NAMESPACE | 1 + R/cdc_baseline_forecaster.R | 8 ++++---- R/layer_cdc_flatline_quantiles.R | 4 ++-- R/make_smooth_quantile_reg.R | 2 +- man/cdc_baseline_args_list.Rd | 4 ++-- man/cdc_baseline_forecaster.Rd | 4 ++-- man/layer_cdc_flatline_quantiles.Rd | 10 +++++----- 8 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 5139bcabe..cb36bb9d2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ ^musings$ ^data-raw$ ^vignettes/articles$ +^.git-blame-ignore-revs$ diff --git a/NAMESPACE b/NAMESPACE index d7e941b0f..21cd7b83f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -186,6 +186,7 @@ importFrom(rlang,caller_env) importFrom(rlang,is_empty) importFrom(rlang,is_null) importFrom(rlang,quos) +importFrom(smoothqr,smooth_qr) importFrom(stats,as.formula) importFrom(stats,family) importFrom(stats,lm) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index f5961431d..f1014d666 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -33,7 +33,7 @@ #' ungroup() %>% #' filter(weekdays(time_value) == "Saturday") #' -#' cdc <- cdc_baseline_forecaster(deaths, "deaths") +#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") #' preds <- pivot_quantiles(cdc$predictions, .pred_distn) #' #' if (require(ggplot2)) { @@ -46,7 +46,7 @@ #' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + #' geom_line(aes(y = .pred), color = "orange") + #' geom_line( -#' data = deaths %>% filter(geo_value %in% four_states), +#' data = weekly_deaths %>% filter(geo_value %in% four_states), #' aes(x = time_value, y = deaths) #' ) + #' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + @@ -150,7 +150,7 @@ cdc_baseline_forecaster <- function( #' @param nonneg Logical. Force all predictive intervals be non-negative. #' Because non-negativity is forced _before_ propagating forward, this #' has slightly different behaviour than would occur if using -#' [layer_threshold_preds()]. +#' [layer_threshold()]. #' #' @return A list containing updated parameter choices with class `cdc_flat_fcast`. #' @export @@ -158,7 +158,7 @@ cdc_baseline_forecaster <- function( #' @examples #' cdc_baseline_args_list() #' cdc_baseline_args_list(symmetrize = FALSE) -#' cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +#' cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) cdc_baseline_args_list <- function( data_frequency = "1 week", aheads = 1:4, diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index aa953ad45..f1a159b9a 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -25,7 +25,7 @@ #' typically observed daily (possibly with missing values), but #' with weekly forecast targets, you would use `c(7, 14, 21, 28)`. But with #' weekly data, you would use `1:4`. -#' @param quantiles Numeric vector of probabilities with values in (0,1) +#' @param quantile_levels Numeric vector of probabilities with values in (0,1) #' referring to the desired predictive intervals. The default is the standard #' set for the COVID Forecast Hub. #' @param nsims Positive integer. The number of draws from the empirical CDF. @@ -35,7 +35,7 @@ #' @param nonneg Logical. Force all predictive intervals be non-negative. #' Because non-negativity is forced _before_ propagating forward, this #' has slightly different behaviour than would occur if using -#' [layer_threshold_preds()]. +#' [layer_threshold()]. #' #' @return an updated `frosting` postprocessor. Calling [predict()] will result #' in an additional `` named `.pred_distn_all` containing 2-column diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index cfb08a9c7..7d25a8c6b 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -21,7 +21,7 @@ #' #' @seealso [fit.model_spec()], [set_engine()] #' -#' @importFrom quantreg rq +#' @importFrom smoothqr smooth_qr #' @examples #' tib <- data.frame( #' y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 37c326e9c..2f6546f74 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -53,7 +53,7 @@ residual quantiles are used. It is not applicable with \item{nonneg}{Logical. Force all predictive intervals be non-negative. Because non-negativity is forced \emph{before} propagating forward, this has slightly different behaviour than would occur if using -\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} +\code{\link[=layer_threshold]{layer_threshold()}}.} \item{quantile_by_key}{Character vector. Groups residuals by listed keys before calculating residual quantiles. See the \code{by_key} argument to @@ -81,5 +81,5 @@ Constructs a list of arguments for \code{\link[=cdc_baseline_forecaster]{cdc_bas \examples{ cdc_baseline_args_list() cdc_baseline_args_list(symmetrize = FALSE) -cdc_baseline_args_list(levels = c(.1, .3, .7, .9), n_training = 120) +cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) } diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 8d0e1b3ec..65de1c29e 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -48,7 +48,7 @@ weekly_deaths <- case_death_rate_subset \%>\% ungroup() \%>\% filter(weekdays(time_value) == "Saturday") -cdc <- cdc_baseline_forecaster(deaths, "deaths") +cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") preds <- pivot_quantiles(cdc$predictions, .pred_distn) if (require(ggplot2)) { @@ -61,7 +61,7 @@ preds \%>\% geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + geom_line(aes(y = .pred), color = "orange") + geom_line( - data = deaths \%>\% filter(geo_value \%in\% four_states), + data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), aes(x = time_value, y = deaths) ) + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 8eb42b423..71f414e25 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -27,6 +27,10 @@ typically observed daily (possibly with missing values), but with weekly forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, you would use \code{1:4}.} +\item{quantile_levels}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} + \item{nsims}{Positive integer. The number of draws from the empirical CDF. These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in linear interpolation on the X scale. This is achieved with @@ -40,13 +44,9 @@ calculating quantiles. The default, \code{c()} performs no grouping.} \item{nonneg}{Logical. Force all predictive intervals be non-negative. Because non-negativity is forced \emph{before} propagating forward, this has slightly different behaviour than would occur if using -\code{\link[=layer_threshold_preds]{layer_threshold_preds()}}.} +\code{\link[=layer_threshold]{layer_threshold()}}.} \item{id}{a random id string} - -\item{quantiles}{Numeric vector of probabilities with values in (0,1) -referring to the desired predictive intervals. The default is the standard -set for the COVID Forecast Hub.} } \value{ an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result From 965155de483c0a13c9c5b76e93bd958f8295d913 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 12:33:18 -0700 Subject: [PATCH 17/22] finish quantile pivotting helpers, redocument --- NAMESPACE | 3 +- NEWS.md | 2 +- R/dist_quantiles.R | 87 ---------- R/pivot_quantiles.R | 157 ++++++++++++++++++ _pkgdown.yml | 2 +- man/nested_quantiles.Rd | 2 +- man/pivot_quantiles_longer.Rd | 42 +++++ ..._quantiles.Rd => pivot_quantiles_wider.Rd} | 16 +- tests/testthat/test-pivot_quantiles.R | 58 ++++++- 9 files changed, 262 insertions(+), 107 deletions(-) create mode 100644 R/pivot_quantiles.R create mode 100644 man/pivot_quantiles_longer.Rd rename man/{pivot_quantiles.Rd => pivot_quantiles_wider.Rd} (75%) diff --git a/NAMESPACE b/NAMESPACE index 21cd7b83f..ec783a737 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -148,7 +148,8 @@ export(layer_unnest) export(nested_quantiles) export(new_default_epi_recipe_blueprint) export(new_epi_recipe_blueprint) -export(pivot_quantiles) +export(pivot_quantiles_longer) +export(pivot_quantiles_wider) export(prep) export(quantile_reg) export(remove_frosting) diff --git a/NEWS.md b/NEWS.md index fa99c8bcd..12442639b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,7 +7,7 @@ * canned forecasters get a class * fixed quantile bug in `flatline_forecaster()` * add functionality to output the unfit workflow from the canned forecasters -* add `pivot_quantiles()` for easier plotting +* add `pivot_quantiles_wider()` for easier plotting # epipredict 0.0.4 diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R index 032a4d96c..ff14d6733 100644 --- a/R/dist_quantiles.R +++ b/R/dist_quantiles.R @@ -116,93 +116,6 @@ is_dist_quantiles <- function(x) { } -#' Turn a vector of quantile distributions into a list-col -#' -#' @param x a `distribution` containing `dist_quantiles` -#' -#' @return a list-col -#' @export -#' -#' @examples -#' edf <- case_death_rate_subset[1:3, ] -#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) -#' -#' edf_nested <- edf %>% dplyr::mutate(q = nested_quantiles(q)) -#' edf_nested %>% tidyr::unnest(q) -nested_quantiles <- function(x) { - stopifnot(is_dist_quantiles(x)) - distributional:::dist_apply(x, .f = function(z) { - tibble::as_tibble(vec_data(z)) %>% - dplyr::mutate(dplyr::across(tidyselect::everything(), as.double)) %>% - list_of() - }) -} - - -#' Pivot columns containing `dist_quantile` wider -#' -#' Any selected columns that contain `dist_quantiles` will be "widened" with -#' the "taus" (quantile) serving as names and the values in the data frame. -#' When pivoting multiple columns, the original column name will be used as -#' a prefix. -#' -#' @param .data A data frame, or a data frame extension such as a tibble or -#' epi_df. -#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted -#' expressions separated by commas. Variable names can be used as if they -#' were positions in the data frame, so expressions like `x:y` can -#' be used to select a range of variables. Any selected columns should -#' -#' @return An object of the same class as `.data` -#' @export -#' -#' @examples -#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) -#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) -#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -#' -#' pivot_quantiles(tib, c("d1", "d2")) -#' pivot_quantiles(tib, tidyselect::starts_with("d")) -#' pivot_quantiles(tib, d2) -pivot_quantiles <- function(.data, ...) { - expr <- rlang::expr(c(...)) - cols <- names(tidyselect::eval_select(expr, .data)) - dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) - if (!all(dqs)) { - nms <- cols[!dqs] - cli::cli_abort( - "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." - ) - } - .data <- .data %>% - dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) - checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) - if (!all(checks)) { - nms <- cols[!checks] - cli::cli_abort( - c("Quantiles must be the same length and have the same set of taus.", - i = "Check failed for variables(s) {.var {nms}}." - ) - ) - } - if (length(cols) > 1L) { - for (col in cols) { - .data <- .data %>% - tidyr::unnest(tidyselect::all_of(col)) %>% - tidyr::pivot_wider( - names_from = "tau", values_from = "q", - names_prefix = paste0(col, "_") - ) - } - } else { - .data <- .data %>% - tidyr::unnest(tidyselect::all_of(cols)) %>% - tidyr::pivot_wider(names_from = "tau", values_from = "q") - } - .data -} - - #' @export diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R new file mode 100644 index 000000000..94bfde521 --- /dev/null +++ b/R/pivot_quantiles.R @@ -0,0 +1,157 @@ +#' Turn a vector of quantile distributions into a list-col +#' +#' @param x a `distribution` containing `dist_quantiles` +#' +#' @return a list-col +#' @export +#' +#' @examples +#' edf <- case_death_rate_subset[1:3, ] +#' edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) +#' +#' edf_nested <- edf %>% dplyr::mutate(q = nested_quantiles(q)) +#' edf_nested %>% tidyr::unnest(q) +nested_quantiles <- function(x) { + stopifnot(is_dist_quantiles(x)) + distributional:::dist_apply(x, .f = function(z) { + tibble::as_tibble(vec_data(z)) %>% + dplyr::mutate(dplyr::across(tidyselect::everything(), as.double)) %>% + list_of() + }) +} + + +#' Pivot columns containing `dist_quantile` longer +#' +#' Selected columns that contains `dist_quantiles` will be "lengthened" with +#' the "taus" (quantile) serving as 1 column and the values as another. If +#' multiple columns are selected, these will be prefixed the the column name. +#' +#' @param .data A data frame, or a data frame extension such as a tibble or +#' epi_df. +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted +#' expressions separated by commas. Variable names can be used as if they +#' were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. +#' @param .ignore_length_check If multiple columns are selected, as long as +#' each row has contains the same number of quantiles, the result will be +#' reasonable. But if, for example, `var1[1]` has 5 quantiles while `var2[1]` +#' has 7, then the only option would be to recycle everything, creating a +#' _very_ long result. By default, this would throw an error. But if this is +#' really the goal, then the error can be bypassed by setting this argument +#' to `TRUE`. The first selected column will vary fastest. +#' +#' @return An object of the same class as `.data`. +#' @export +#' +#' @examples +#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) +#' +#' pivot_quantiles_longer(tib, "d1") +#' pivot_quantiles_longer(tib, tidyselect::ends_with("1")) +#' pivot_quantiles_longer(tib, d1, d2) +pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) { + cols <- validate_pivot_quantiles(.data, ...) + .data <- .data %>% + dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) + if (length(cols) > 1L) { + lengths_check <- .data %>% + dplyr::transmute(dplyr::across( + tidyselect::all_of(cols), + ~ map_int(.x, vctrs::vec_size) + )) %>% + as.matrix() %>% + apply(1, function(x) dplyr::n_distinct(x) == 1L) %>% + all() + if (lengths_check) { + .data <- tidyr::unnest(.data, tidyselect::all_of(cols), names_sep = "_") + } else { + if (.ignore_length_check) { + for (col in cols) { + .data <- .data %>% + tidyr::unnest(tidyselect::all_of(col), names_sep = "_") + } + } else { + cli::cli_abort(c( + "Some selected columns contain different numbers of quantiles.", + "The result would be a {.emph very} long {.cls tibble}.", + "To do this anyway, rerun with `.ignore_length_check = TRUE`." + )) + } + } + } else { + .data <- .data %>% tidyr::unnest(tidyselect::all_of(cols)) + } + .data +} + +#' Pivot columns containing `dist_quantile` wider +#' +#' Any selected columns that contain `dist_quantiles` will be "widened" with +#' the "taus" (quantile) serving as names and the values in the data frame. +#' When pivoting multiple columns, the original column name will be used as +#' a prefix. +#' +#' @param .data A data frame, or a data frame extension such as a tibble or +#' epi_df. +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted +#' expressions separated by commas. Variable names can be used as if they +#' were positions in the data frame, so expressions like `x:y` can +#' be used to select a range of variables. +#' +#' @return An object of the same class as `.data` +#' @export +#' +#' @examples +#' d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +#' d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +#' tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) +#' +#' pivot_quantiles_wider(tib, c("d1", "d2")) +#' pivot_quantiles_wider(tib, tidyselect::starts_with("d")) +#' pivot_quantiles_wider(tib, d2) +pivot_quantiles_wider <- function(.data, ...) { + cols <- validate_pivot_quantiles(.data, ...) + .data <- .data %>% + dplyr::mutate(dplyr::across(tidyselect::all_of(cols), nested_quantiles)) + checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) + if (!all(checks)) { + nms <- cols[!checks] + cli::cli_abort( + c("Quantiles must be the same length and have the same set of taus.", + i = "Check failed for variables(s) {.var {nms}}." + ) + ) + } + if (length(cols) > 1L) { + for (col in cols) { + .data <- .data %>% + tidyr::unnest(tidyselect::all_of(col)) %>% + tidyr::pivot_wider( + names_from = "tau", values_from = "q", + names_prefix = paste0(col, "_") + ) + } + } else { + .data <- .data %>% + tidyr::unnest(tidyselect::all_of(cols)) %>% + tidyr::pivot_wider(names_from = "tau", values_from = "q") + } + .data +} + + +validate_pivot_quantiles <- function(.data, ...) { + expr <- rlang::expr(c(...)) + cols <- names(tidyselect::eval_select(expr, .data)) + dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) + if (!all(dqs)) { + nms <- cols[!dqs] + cli::cli_abort( + "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." + ) + } + cols +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 89fd733e1..ac66b8208 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -68,7 +68,7 @@ reference: - dist_quantiles - extrapolate_quantiles - nested_quantiles - - pivot_quantiles + - starts_with("pivot_quantiles") - title: Included datasets contents: - case_death_rate_subset diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index c4b578c1a..143532650 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R +% Please edit documentation in R/pivot_quantiles.R \name{nested_quantiles} \alias{nested_quantiles} \title{Turn a vector of quantile distributions into a list-col} diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd new file mode 100644 index 000000000..f29f27cd2 --- /dev/null +++ b/man/pivot_quantiles_longer.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles_longer} +\alias{pivot_quantiles_longer} +\title{Pivot columns containing \code{dist_quantile} longer} +\usage{ +pivot_quantiles_longer(.data, ..., .ignore_length_check = FALSE) +} +\arguments{ +\item{.data}{A data frame, or a data frame extension such as a tibble or +epi_df.} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted +expressions separated by commas. Variable names can be used as if they +were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables.} + +\item{.ignore_length_check}{If multiple columns are selected, as long as +each row has contains the same number of quantiles, the result will be +reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2[1]} +has 7, then the only option would be to recycle everything, creating a +\emph{very} long result. By default, this would throw an error. But if this is +really the goal, then the error can be bypassed by setting this argument +to \code{TRUE}.} +} +\value{ +An object of the same class as \code{.data}. +} +\description{ +Selected columns that contains \code{dist_quantiles} will be "lengthened" with +the "taus" (quantile) serving as 1 column and the values as another. If +multiple columns are selected, these will be prefixed the the column name. +} +\examples{ +d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) +d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) +tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + +pivot_quantiles_longer(tib, "d1") +pivot_quantiles_longer(tib, tidyselect::ends_with("1")) +pivot_quantiles_longer(tib, d1, d2) +} diff --git a/man/pivot_quantiles.Rd b/man/pivot_quantiles_wider.Rd similarity index 75% rename from man/pivot_quantiles.Rd rename to man/pivot_quantiles_wider.Rd index 0ed6588ed..02a33bb2f 100644 --- a/man/pivot_quantiles.Rd +++ b/man/pivot_quantiles_wider.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dist_quantiles.R -\name{pivot_quantiles} -\alias{pivot_quantiles} +% Please edit documentation in R/pivot_quantiles.R +\name{pivot_quantiles_wider} +\alias{pivot_quantiles_wider} \title{Pivot columns containing \code{dist_quantile} wider} \usage{ -pivot_quantiles(.data, ...) +pivot_quantiles_wider(.data, ...) } \arguments{ \item{.data}{A data frame, or a data frame extension such as a tibble or @@ -13,7 +13,7 @@ epi_df.} \item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables. Any selected columns should} +be used to select a range of variables.} } \value{ An object of the same class as \code{.data} @@ -29,7 +29,7 @@ d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) -pivot_quantiles(tib, c("d1", "d2")) -pivot_quantiles(tib, tidyselect::starts_with("d")) -pivot_quantiles(tib, d2) +pivot_quantiles_wider(tib, c("d1", "d2")) +pivot_quantiles_wider(tib, tidyselect::starts_with("d")) +pivot_quantiles_wider(tib, d2) } diff --git a/tests/testthat/test-pivot_quantiles.R b/tests/testthat/test-pivot_quantiles.R index 85694aace..cdf84f28d 100644 --- a/tests/testthat/test-pivot_quantiles.R +++ b/tests/testthat/test-pivot_quantiles.R @@ -1,26 +1,68 @@ -test_that("quantile pivotting behaves", { +test_that("quantile pivotting wider behaves", { tib <- tibble::tibble(a = 1:5, b = 6:10) - expect_error(pivot_quantiles(tib, a)) + expect_error(pivot_quantiles_wider(tib, a)) tib$c <- rep(dist_normal(), 5) - expect_error(pivot_quantiles(tib, c)) + expect_error(pivot_quantiles_wider(tib, c)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) # different quantiles tib <- tib[1:2, ] tib$d1 <- d1 - expect_error(pivot_quantiles(tib, d1)) + expect_error(pivot_quantiles_wider(tib, d1)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) tib$d1 <- d1 # would want to error (mismatched quantiles), but hard to check efficiently - expect_silent(pivot_quantiles(tib, d1)) + expect_silent(pivot_quantiles_wider(tib, d1)) d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) - expect_length(pivot_quantiles(tib, c("d1", "d2")), 7L) - expect_length(pivot_quantiles(tib, tidyselect::starts_with("d")), 7L) - expect_length(pivot_quantiles(tib, d2), 5L) + expect_length(pivot_quantiles_wider(tib, c("d1", "d2")), 7L) + expect_length(pivot_quantiles_wider(tib, tidyselect::starts_with("d")), 7L) + expect_length(pivot_quantiles_wider(tib, d2), 5L) +}) + + +test_that("quantile pivotting longer behaves", { + tib <- tibble::tibble(a = 1:5, b = 6:10) + expect_error(pivot_quantiles_longer(tib, a)) + tib$c <- rep(dist_normal(), 5) + expect_error(pivot_quantiles_longer(tib, c)) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:5, 1:4 / 5)) + # different quantiles + tib <- tib[1:2, ] + tib$d1 <- d1 + expect_length(pivot_quantiles_longer(tib, d1), 5L) + expect_identical(nrow(pivot_quantiles_longer(tib, d1)), 7L) + expect_identical(pivot_quantiles_longer(tib, d1)$q, as.double(c(1:3, 2:5))) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 2:4 / 4)) + tib$d1 <- d1 + expect_silent(pivot_quantiles_longer(tib, d1)) + + d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) + d2 <- c(dist_quantiles(2:4, 2:4 / 5), dist_quantiles(3:5, 2:4 / 5)) + tib <- tibble::tibble(g = c("a", "b"), d1 = d1, d2 = d2) + + + expect_length(pivot_quantiles_longer(tib, c("d1", "d2")), 5L) + expect_identical(nrow(pivot_quantiles_longer(tib, c("d1", "d2"))), 6L) + expect_silent(pivot_quantiles_longer(tib, tidyselect::starts_with("d"))) + expect_length(pivot_quantiles_longer(tib, d2), 5L) + + tib$d3 <- c(dist_quantiles(2:5, 2:5 / 6), dist_quantiles(3:6, 2:5 / 6)) + # now the cols have different numbers of quantiles + expect_error(pivot_quantiles_longer(tib, d1, d3)) + expect_length( + pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE), + 6L + ) + expect_identical( + pivot_quantiles_longer(tib, d1, d3, .ignore_length_check = TRUE)$d1_q, + as.double(rep(c(1:3, 2:4), each = 4)) + ) }) From 1cf5dff5db4399c0fd5d57dabf636207c01f5ca4 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 25 Sep 2023 12:46:50 -0700 Subject: [PATCH 18/22] fix extra check note. --- man/pivot_quantiles_longer.Rd | 2 +- tests/testthat/test-pivot_quantiles.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd index f29f27cd2..1cb6f5165 100644 --- a/man/pivot_quantiles_longer.Rd +++ b/man/pivot_quantiles_longer.Rd @@ -21,7 +21,7 @@ reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2 has 7, then the only option would be to recycle everything, creating a \emph{very} long result. By default, this would throw an error. But if this is really the goal, then the error can be bypassed by setting this argument -to \code{TRUE}.} +to \code{TRUE}. The first selected column will vary fastest.} } \value{ An object of the same class as \code{.data}. diff --git a/tests/testthat/test-pivot_quantiles.R b/tests/testthat/test-pivot_quantiles.R index cdf84f28d..9928c5e09 100644 --- a/tests/testthat/test-pivot_quantiles.R +++ b/tests/testthat/test-pivot_quantiles.R @@ -52,7 +52,7 @@ test_that("quantile pivotting longer behaves", { expect_length(pivot_quantiles_longer(tib, c("d1", "d2")), 5L) expect_identical(nrow(pivot_quantiles_longer(tib, c("d1", "d2"))), 6L) expect_silent(pivot_quantiles_longer(tib, tidyselect::starts_with("d"))) - expect_length(pivot_quantiles_longer(tib, d2), 5L) + expect_length(pivot_quantiles_longer(tib, d2), 4L) tib$d3 <- c(dist_quantiles(2:5, 2:5 / 6), dist_quantiles(3:6, 2:5 / 6)) # now the cols have different numbers of quantiles From 1458ab0c204e3980e806071efa6d4108805ed3bd Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 29 Sep 2023 16:21:37 -0700 Subject: [PATCH 19/22] add lifecycle, deprecate pivot_quantiles. --- DESCRIPTION | 1 + NAMESPACE | 1 + R/epipredict-package.R | 3 +++ R/pivot_quantiles.R | 3 +++ man/figures/lifecycle-archived.svg | 21 ++++++++++++++++ man/figures/lifecycle-defunct.svg | 21 ++++++++++++++++ man/figures/lifecycle-deprecated.svg | 21 ++++++++++++++++ man/figures/lifecycle-experimental.svg | 21 ++++++++++++++++ man/figures/lifecycle-maturing.svg | 21 ++++++++++++++++ man/figures/lifecycle-questioning.svg | 21 ++++++++++++++++ man/figures/lifecycle-soft-deprecated.svg | 21 ++++++++++++++++ man/figures/lifecycle-stable.svg | 29 +++++++++++++++++++++++ man/figures/lifecycle-superseded.svg | 21 ++++++++++++++++ 13 files changed, 205 insertions(+) create mode 100644 man/figures/lifecycle-archived.svg create mode 100644 man/figures/lifecycle-defunct.svg create mode 100644 man/figures/lifecycle-deprecated.svg create mode 100644 man/figures/lifecycle-experimental.svg create mode 100644 man/figures/lifecycle-maturing.svg create mode 100644 man/figures/lifecycle-questioning.svg create mode 100644 man/figures/lifecycle-soft-deprecated.svg create mode 100644 man/figures/lifecycle-stable.svg create mode 100644 man/figures/lifecycle-superseded.svg diff --git a/DESCRIPTION b/DESCRIPTION index 75602f072..eb6405df4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Imports: generics, glue, hardhat (>= 1.3.0), + lifecycle, magrittr, methods, quantreg, diff --git a/NAMESPACE b/NAMESPACE index ec783a737..a24e5844b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -173,6 +173,7 @@ importFrom(generics,augment) importFrom(generics,fit) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) +importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") importFrom(methods,is) importFrom(quantreg,rq) diff --git a/R/epipredict-package.R b/R/epipredict-package.R index 51478065b..da4991feb 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -1,5 +1,8 @@ +## usethis namespace: start #' @importFrom tibble tibble #' @importFrom rlang := !! #' @importFrom stats poly predict lm residuals quantile +#' @importFrom lifecycle deprecated #' @import epiprocess parsnip +## usethis namespace: end NULL diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index 94bfde521..de4aa1e01 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -142,6 +142,9 @@ pivot_quantiles_wider <- function(.data, ...) { .data } +pivot_quantiles <- function(.data, ...) { + lifecycle::deprecate_stop("0.0.6", "pivot_quantiles()", "pivot_quantiles_wider()") +} validate_pivot_quantiles <- function(.data, ...) { expr <- rlang::expr(c(...)) diff --git a/man/figures/lifecycle-archived.svg b/man/figures/lifecycle-archived.svg new file mode 100644 index 000000000..745ab0c78 --- /dev/null +++ b/man/figures/lifecycle-archived.svg @@ -0,0 +1,21 @@ + + lifecycle: archived + + + + + + + + + + + + + + + lifecycle + + archived + + diff --git a/man/figures/lifecycle-defunct.svg b/man/figures/lifecycle-defunct.svg new file mode 100644 index 000000000..d5c9559ed --- /dev/null +++ b/man/figures/lifecycle-defunct.svg @@ -0,0 +1,21 @@ + + lifecycle: defunct + + + + + + + + + + + + + + + lifecycle + + defunct + + diff --git a/man/figures/lifecycle-deprecated.svg b/man/figures/lifecycle-deprecated.svg new file mode 100644 index 000000000..b61c57c3f --- /dev/null +++ b/man/figures/lifecycle-deprecated.svg @@ -0,0 +1,21 @@ + + lifecycle: deprecated + + + + + + + + + + + + + + + lifecycle + + deprecated + + diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg new file mode 100644 index 000000000..5d88fc2c6 --- /dev/null +++ b/man/figures/lifecycle-experimental.svg @@ -0,0 +1,21 @@ + + lifecycle: experimental + + + + + + + + + + + + + + + lifecycle + + experimental + + diff --git a/man/figures/lifecycle-maturing.svg b/man/figures/lifecycle-maturing.svg new file mode 100644 index 000000000..897370ecf --- /dev/null +++ b/man/figures/lifecycle-maturing.svg @@ -0,0 +1,21 @@ + + lifecycle: maturing + + + + + + + + + + + + + + + lifecycle + + maturing + + diff --git a/man/figures/lifecycle-questioning.svg b/man/figures/lifecycle-questioning.svg new file mode 100644 index 000000000..7c1721d05 --- /dev/null +++ b/man/figures/lifecycle-questioning.svg @@ -0,0 +1,21 @@ + + lifecycle: questioning + + + + + + + + + + + + + + + lifecycle + + questioning + + diff --git a/man/figures/lifecycle-soft-deprecated.svg b/man/figures/lifecycle-soft-deprecated.svg new file mode 100644 index 000000000..9c166ff30 --- /dev/null +++ b/man/figures/lifecycle-soft-deprecated.svg @@ -0,0 +1,21 @@ + + lifecycle: soft-deprecated + + + + + + + + + + + + + + + lifecycle + + soft-deprecated + + diff --git a/man/figures/lifecycle-stable.svg b/man/figures/lifecycle-stable.svg new file mode 100644 index 000000000..9bf21e76b --- /dev/null +++ b/man/figures/lifecycle-stable.svg @@ -0,0 +1,29 @@ + + lifecycle: stable + + + + + + + + + + + + + + + + lifecycle + + + + stable + + + diff --git a/man/figures/lifecycle-superseded.svg b/man/figures/lifecycle-superseded.svg new file mode 100644 index 000000000..db8d757f7 --- /dev/null +++ b/man/figures/lifecycle-superseded.svg @@ -0,0 +1,21 @@ + + lifecycle: superseded + + + + + + + + + + + + + + + lifecycle + + superseded + + From 7358c1350a7de1ffff844b5c68a70d251e03d2c0 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 5 Oct 2023 11:59:56 -0700 Subject: [PATCH 20/22] CI: also needs to be on the branch --- .github/workflows/R-CMD-check.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c4bcd6b68..eff7367ec 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to use API key. on: push: - branches: [main, master] + branches: [main, master, v0.0.6] pull_request: - branches: [main, master] + branches: [main, master, v0.0.6] name: R-CMD-check From 169f7641334b61433079a1fb31ce9d909094b344 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 5 Oct 2023 15:52:27 -0700 Subject: [PATCH 21/22] address @nmdefries comments --- NEWS.md | 1 + R/pivot_quantiles.R | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 12442639b..cba55a67d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,7 @@ * fixed quantile bug in `flatline_forecaster()` * add functionality to output the unfit workflow from the canned forecasters * add `pivot_quantiles_wider()` for easier plotting +* add complement `pivot_quantiles_longer()` # epipredict 0.0.4 diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index de4aa1e01..8f6e4875f 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -23,8 +23,8 @@ nested_quantiles <- function(x) { #' Pivot columns containing `dist_quantile` longer #' -#' Selected columns that contains `dist_quantiles` will be "lengthened" with -#' the "taus" (quantile) serving as 1 column and the values as another. If +#' Selected columns that contain `dist_quantiles` will be "lengthened" with +#' the quantile levels serving as 1 column and the values as another. If #' multiple columns are selected, these will be prefixed the the column name. #' #' @param .data A data frame, or a data frame extension such as a tibble or From 8d1e47d8cc49980bd762d813dad64b2a68273f20 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 5 Oct 2023 16:15:14 -0700 Subject: [PATCH 22/22] pass local checks --- R/cdc_baseline_forecaster.R | 2 +- R/layer_cdc_flatline_quantiles.R | 2 +- R/pivot_quantiles.R | 4 ++-- man/cdc_baseline_forecaster.Rd | 2 +- man/layer_cdc_flatline_quantiles.Rd | 2 +- man/pivot_quantiles_longer.Rd | 8 ++++---- vignettes/articles/sliding.Rmd | 6 +++--- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index f1014d666..62e5172cb 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -34,7 +34,7 @@ #' filter(weekdays(time_value) == "Saturday") #' #' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") -#' preds <- pivot_quantiles(cdc$predictions, .pred_distn) +#' preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn) #' #' if (require(ggplot2)) { #' forecast_date <- unique(preds$forecast_date) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index f1a159b9a..bd2af0bf6 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -72,7 +72,7 @@ #' #' preds <- preds %>% #' unnest(.pred_distn_all) %>% -#' pivot_quantiles(.pred_distn) %>% +#' pivot_quantiles_wider(.pred_distn) %>% #' mutate(target_date = forecast_date + ahead) #' #' if (require("ggplot2")) { diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index 8f6e4875f..a156bcf90 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -25,7 +25,7 @@ nested_quantiles <- function(x) { #' #' Selected columns that contain `dist_quantiles` will be "lengthened" with #' the quantile levels serving as 1 column and the values as another. If -#' multiple columns are selected, these will be prefixed the the column name. +#' multiple columns are selected, these will be prefixed with the column name. #' #' @param .data A data frame, or a data frame extension such as a tibble or #' epi_df. @@ -39,7 +39,7 @@ nested_quantiles <- function(x) { #' has 7, then the only option would be to recycle everything, creating a #' _very_ long result. By default, this would throw an error. But if this is #' really the goal, then the error can be bypassed by setting this argument -#' to `TRUE`. The first selected column will vary fastest. +#' to `TRUE`. The quantiles in the first selected column will vary the fastest. #' #' @return An object of the same class as `.data`. #' @export diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index 65de1c29e..7e62a0521 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -49,7 +49,7 @@ weekly_deaths <- case_death_rate_subset \%>\% filter(weekdays(time_value) == "Saturday") cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") -preds <- pivot_quantiles(cdc$predictions, .pred_distn) +preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn) if (require(ggplot2)) { forecast_date <- unique(preds$forecast_date) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index 71f414e25..594c63afb 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -102,7 +102,7 @@ preds preds <- preds \%>\% unnest(.pred_distn_all) \%>\% - pivot_quantiles(.pred_distn) \%>\% + pivot_quantiles_wider(.pred_distn) \%>\% mutate(target_date = forecast_date + ahead) if (require("ggplot2")) { diff --git a/man/pivot_quantiles_longer.Rd b/man/pivot_quantiles_longer.Rd index 1cb6f5165..f73e6deaf 100644 --- a/man/pivot_quantiles_longer.Rd +++ b/man/pivot_quantiles_longer.Rd @@ -21,15 +21,15 @@ reasonable. But if, for example, \code{var1[1]} has 5 quantiles while \code{var2 has 7, then the only option would be to recycle everything, creating a \emph{very} long result. By default, this would throw an error. But if this is really the goal, then the error can be bypassed by setting this argument -to \code{TRUE}. The first selected column will vary fastest.} +to \code{TRUE}. The quantiles in the first selected column will vary the fastest.} } \value{ An object of the same class as \code{.data}. } \description{ -Selected columns that contains \code{dist_quantiles} will be "lengthened" with -the "taus" (quantile) serving as 1 column and the values as another. If -multiple columns are selected, these will be prefixed the the column name. +Selected columns that contain \code{dist_quantiles} will be "lengthened" with +the quantile levels serving as 1 column and the values as another. If +multiple columns are selected, these will be prefixed with the column name. } \examples{ d1 <- c(dist_quantiles(1:3, 1:3 / 4), dist_quantiles(2:4, 1:3 / 4)) diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd index 67af7289d..e889f0b74 100644 --- a/vignettes/articles/sliding.Rmd +++ b/vignettes/articles/sliding.Rmd @@ -129,7 +129,7 @@ fc <- bind_rows( ) ) %>% list_rbind() ) %>% - pivot_quantiles(fc_.pred_distn) + pivot_quantiles_wider(fc_.pred_distn) ``` Here, `arx_forecaster()` does all the heavy lifting. It creates leads of the @@ -225,7 +225,7 @@ can_fc <- bind_rows( ) ) %>% list_rbind() ) %>% - pivot_quantiles(fc_.pred_distn) + pivot_quantiles_wider(fc_.pred_distn) ``` The figures below shows the results for all of the provinces. @@ -327,7 +327,7 @@ k_week_version_aware <- function(ahead = 7, version_aware = TRUE) { fc <- bind_rows( map(aheads, ~ k_week_version_aware(.x, TRUE)) %>% list_rbind(), map(aheads, ~ k_week_version_aware(.x, FALSE)) %>% list_rbind() -) %>% pivot_quantiles(fc_.pred_distn) +) %>% pivot_quantiles_wider(fc_.pred_distn) ``` Now we can plot the results on top of the latest case rates. As before, we will only display and focus on the results for FL and CA for simplicity.