diff --git a/.Rbuildignore b/.Rbuildignore index 5139bcabe..cb36bb9d2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ ^musings$ ^data-raw$ ^vignettes/articles$ +^.git-blame-ignore-revs$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c4bcd6b68..eff7367ec 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,9 +4,9 @@ # Created with usethis + edited to use API key. on: push: - branches: [main, master] + branches: [main, master, v0.0.6] pull_request: - branches: [main, master] + branches: [main, master, v0.0.6] name: R-CMD-check diff --git a/NAMESPACE b/NAMESPACE index b22ec53a5..03b159764 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,8 @@ S3method(extrapolate_quantiles,dist_default) S3method(extrapolate_quantiles,dist_quantiles) S3method(extrapolate_quantiles,distribution) S3method(fit,epi_workflow) +S3method(flusight_hub_formatter,canned_epipred) +S3method(flusight_hub_formatter,data.frame) S3method(format,dist_quantiles) S3method(is.na,dist_quantiles) S3method(is.na,distribution) @@ -52,6 +54,7 @@ S3method(print,alist) S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) +S3method(print,cdc_baseline_fcast) S3method(print,epi_workflow) S3method(print,flat_fcast) S3method(print,flatline) @@ -79,6 +82,7 @@ S3method(residuals,flatline) S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_add_forecast_date) S3method(slather,layer_add_target_date) +S3method(slather,layer_cdc_flatline_quantiles) S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) @@ -106,6 +110,8 @@ export(arx_classifier) export(arx_fcast_epi_workflow) export(arx_forecaster) export(bake) +export(cdc_baseline_args_list) +export(cdc_baseline_forecaster) export(create_layer) export(default_epi_recipe_blueprint) export(detect_layer) @@ -122,6 +128,7 @@ export(fit) export(flatline) export(flatline_args_list) export(flatline_forecaster) +export(flusight_hub_formatter) export(frosting) export(get_test_data) export(grab_names) @@ -131,6 +138,7 @@ export(is_layer) export(layer) export(layer_add_forecast_date) export(layer_add_target_date) +export(layer_cdc_flatline_quantiles) export(layer_naomit) export(layer_point_from_distn) export(layer_population_scaling) @@ -181,6 +189,7 @@ importFrom(rlang,caller_env) importFrom(rlang,is_empty) importFrom(rlang,is_null) importFrom(rlang,quos) +importFrom(smoothqr,smooth_qr) importFrom(stats,as.formula) importFrom(stats,family) importFrom(stats,lm) diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R new file mode 100644 index 000000000..e66563b65 --- /dev/null +++ b/R/cdc_baseline_forecaster.R @@ -0,0 +1,228 @@ +#' Predict the future with the most recent value +#' +#' This is a simple forecasting model for +#' [epiprocess::epi_df] data. It uses the most recent observation as the +#' forecast for any future date, and produces intervals by shuffling the quantiles +#' of the residuals of such a "flatline" forecast and incrementing these +#' forward over all available training data. +#' +#' By default, the predictive intervals are computed separately for each +#' combination of `geo_value` in the `epi_data` argument. +#' +#' This forecaster is meant to produce exactly the CDC Baseline used for +#' [COVID19ForecastHub](https://covid19forecasthub.org) +#' +#' @param epi_data An [`epiprocess::epi_df`] +#' @param outcome A scalar character for the column name we wish to predict. +#' @param args_list A list of additional arguments as created by the +#' [cdc_baseline_args_list()] constructor function. +#' +#' @return A data frame of point and interval forecasts for all aheads (unique +#' horizons) for each unique combination of `key_vars`. +#' @export +#' +#' @examples +#' library(dplyr) +#' weekly_deaths <- case_death_rate_subset %>% +#' select(geo_value, time_value, death_rate) %>% +#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% +#' select(-pop, -death_rate) %>% +#' group_by(geo_value) %>% +#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% +#' ungroup() %>% +#' filter(weekdays(time_value) == "Saturday") +#' +#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +#' preds <- pivot_quantiles(cdc$predictions, .pred_distn) +#' +#' if (require(ggplot2)) { +#' forecast_date <- unique(preds$forecast_date) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = weekly_deaths %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = deaths) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Weekly deaths") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' } +cdc_baseline_forecaster <- function( + epi_data, + outcome, + args_list = cdc_baseline_args_list()) { + validate_forecaster_inputs(epi_data, outcome, "time_value") + if (!inherits(args_list, c("cdc_flat_fcast", "alist"))) { + cli_stop("args_list was not created using `cdc_baseline_args_list().") + } + keys <- epi_keys(epi_data) + ek <- kill_time_value(keys) + outcome <- rlang::sym(outcome) + + + r <- epi_recipe(epi_data) %>% + step_epi_ahead(!!outcome, ahead = args_list$data_frequency, skip = TRUE) %>% + recipes::update_role(!!outcome, new_role = "predictor") %>% + recipes::add_role(tidyselect::all_of(keys), new_role = "predictor") %>% + step_training_window(n_recent = args_list$n_training) + + forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) + # target_date <- args_list$target_date %||% forecast_date + args_list$ahead + + + latest <- get_test_data( + epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, + forecast_date + ) + + f <- frosting() %>% + layer_predict() %>% + layer_cdc_flatline_quantiles( + aheads = args_list$aheads, + quantile_levels = args_list$quantile_levels, + nsims = args_list$nsims, + by_key = args_list$quantile_by_key, + symmetrize = args_list$symmetrize, + nonneg = args_list$nonneg + ) %>% + layer_add_forecast_date(forecast_date = forecast_date) %>% + layer_unnest(.pred_distn_all) + # layer_add_target_date(target_date = target_date) + if (args_list$nonneg) f <- layer_threshold(f, ".pred") + + eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") + + wf <- epi_workflow(r, eng, f) + wf <- generics::fit(wf, epi_data) + preds <- suppressWarnings(predict(wf, new_data = latest)) %>% + tibble::as_tibble() %>% + dplyr::select(-time_value) %>% + dplyr::mutate(target_date = forecast_date + ahead * args_list$data_frequency) + + structure( + list( + predictions = preds, + epi_workflow = wf, + metadata = list( + training = attr(epi_data, "metadata"), + forecast_created = Sys.time() + ) + ), + class = c("cdc_baseline_fcast", "canned_epipred") + ) +} + + + +#' CDC baseline forecaster argument constructor +#' +#' Constructs a list of arguments for [cdc_baseline_forecaster()]. +#' +#' @inheritParams arx_args_list +#' @param data_frequency Integer or string. This describes the frequency of the +#' input `epi_df`. For typical FluSight forecasts, this would be `"1 week"`. +#' Allowable arguments are integers (taken to mean numbers of days) or a +#' string like `"7 days"` or `"2 weeks"`. Currently, all other periods +#' (other than days or weeks) result in an error. +#' @param aheads Integer vector. Unlike [arx_forecaster()], this doesn't have +#' any effect on the predicted values. +#' Predictions are always the most recent observation. This determines the +#' set of prediction horizons for [layer_cdc_flatline_quantiles()]`. It interacts +#' with the `data_frequency` argument. So, for example, if the data is daily +#' and you want forecasts for 1:4 days ahead, then you would use `1:4`. However, +#' if you want one-week predictions, you would set this as `c(7, 14, 21, 28)`. +#' But if `data_frequency` is `"1 week"`, then you would set it as `1:4`. +#' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce +#' prediction intervals. These are created by computing the quantiles of +#' training residuals. A `NULL` value will result in point forecasts only. +#' @param nsims Positive integer. The number of draws from the empirical CDF. +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +#' in linear interpolation on the X scale. This is achieved with +#' [stats::quantile()] Type 7 (the default for that function). +#' @param nonneg Logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this +#' has slightly different behaviour than would occur if using +#' [layer_threshold()]. +#' +#' @return A list containing updated parameter choices with class `cdc_flat_fcast`. +#' @export +#' +#' @examples +#' cdc_baseline_args_list() +#' cdc_baseline_args_list(symmetrize = FALSE) +#' cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) +cdc_baseline_args_list <- function( + data_frequency = "1 week", + aheads = 1:4, + n_training = Inf, + forecast_date = NULL, + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e3L, + symmetrize = TRUE, + nonneg = TRUE, + quantile_by_key = "geo_value", + nafill_buffer = Inf) { + arg_is_scalar(n_training, nsims, data_frequency) + data_frequency <- parse_period(data_frequency) + arg_is_pos_int(data_frequency) + arg_is_chr(quantile_by_key, allow_empty = TRUE) + arg_is_scalar(forecast_date, allow_null = TRUE) + arg_is_date(forecast_date, allow_null = TRUE) + arg_is_nonneg_int(aheads, nsims) + arg_is_lgl(symmetrize, nonneg) + arg_is_probabilities(quantile_levels, allow_null = TRUE) + arg_is_pos(n_training) + if (is.finite(n_training)) arg_is_pos_int(n_training) + if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) + + structure( + enlist( + data_frequency, + aheads, + n_training, + forecast_date, + quantile_levels, + nsims, + symmetrize, + nonneg, + quantile_by_key, + nafill_buffer + ), + class = c("cdc_baseline_fcast", "alist") + ) +} + +#' @export +print.cdc_baseline_fcast <- function(x, ...) { + name <- "CDC Baseline" + NextMethod(name = name, ...) +} + +parse_period <- function(x) { + arg_is_scalar(x) + if (is.character(x)) { + x <- unlist(strsplit(x, " ")) + if (length(x) == 1L) x <- as.numeric(x) + if (length(x) == 2L) { + mult <- substr(x[2], 1, 3) + mult <- switch( + mult, + day = 1L, + wee = 7L, + cli::cli_abort("incompatible timespan in `aheads`.") + ) + x <- as.numeric(x[1]) * mult + } + if (length(x) > 2L) cli::cli_abort("incompatible timespan in `aheads`.") + } + stopifnot(rlang::is_integerish(x)) + as.integer(x) +} diff --git a/R/compat-purrr.R b/R/compat-purrr.R index 7e28bd630..712926f73 100644 --- a/R/compat-purrr.R +++ b/R/compat-purrr.R @@ -32,6 +32,11 @@ map_chr <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, character(1), ...) } +map_vec <- function(.x, .f, ...) { + out <- map(.x, .f, ...) + vctrs::list_unchop(out) +} + map_dfr <- function(.x, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map(.x, .f, ...) diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index e437f50ea..197c8cca5 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -94,6 +94,12 @@ flatline_forecaster <- function( #' Constructs a list of arguments for [flatline_forecaster()]. #' #' @inheritParams arx_args_list +#' @param ahead Integer. Unlike [arx_forecaster()], this doesn't have any effect +#' on the predicted values. Predictions are always the most recent observation. +#' However, this _does_ impact the residuals stored in the object. Residuals +#' are calculated based on this number to mimic how badly you would have done. +#' So for example, `ahead = 7` will create residuals by comparing values +#' 7 days apart. #' #' @return A list containing updated parameter choices with class `flatline_alist`. #' @export diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R new file mode 100644 index 000000000..d433ab2a7 --- /dev/null +++ b/R/flusight_hub_formatter.R @@ -0,0 +1,131 @@ +abbr_to_fips <- function(abbr) { + fi <- dplyr::left_join( + tibble::tibble(abbr = tolower(abbr)), + state_census, by = "abbr") %>% + dplyr::mutate(fips = as.character(fips), fips = case_when( + fips == "0" ~ "US", + nchar(fips) < 2L ~ paste0("0", fips), + TRUE ~ fips + )) %>% + pull(.data$fips) + names(fi) <- NULL + fi +} + +#' Format predictions for submission to FluSight forecast Hub +#' +#' This function converts predictions from any of the included forecasters into +#' a format (nearly) ready for submission to the 2023-24 +#' [FluSight-forecast-hub](https://github.com/cdcepi/FluSight-forecast-hub). +#' See there for documentation of the required columns. Currently, only +#' "quantile" forcasts are supported, but the intention is to support both +#' "quantile" and "pmf". For this reason, adding the `output_type` column should +#' be done via the `...` argument. See the examples below. The specific required +#' format for this forecast task is [here](https://github.com/cdcepi/FluSight-forecast-hub/blob/main/model-output/README.md). +#' +#' @param object a data.frame of predictions or an object of class +#' `canned_epipred` as created by, e.g., [arx_forecaster()] +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name = value pairs of constant +#' columns (or mutations) to perform to the results. See examples. +#' @param .fcast_period Control whether the `horizon` should represent days or +#' weeks. Depending on whether the forecaster output has target dates +#' from [layer_add_target_date()] or not, we may need to compute the horizon +#' and/or the `target_end_date` from the other available columns in the predictions. +#' When both `ahead` and `target_date` are available, this is ignored. If only +#' `ahead` or `aheads` exists, then the target date may need to be multiplied +#' if the `ahead` represents weekly forecasts. Alternatively, if only, the +#' `target_date` is available, then the `horizon` will be in days, unless +#' this argument is `"weekly"`. Note that these can be adjusted later by the +#' `...` argument. +#' +#' @return A [tibble::tibble]. If `...` is empty, the result will contain the +#' columns `reference_date`, `horizon`, `target_end_date`, `location`, +#' `output_type_id`, and `value`. The `...` can perform mutations on any of +#' these. +#' @export +#' +#' @examples +#' library(dplyr) +#' weekly_deaths <- case_death_rate_subset %>% +#' select(geo_value, time_value, death_rate) %>% +#' left_join(state_census %>% select(pop, abbr), by = c("geo_value" = "abbr")) %>% +#' mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) %>% +#' select(-pop, -death_rate) %>% +#' group_by(geo_value) %>% +#' epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") %>% +#' ungroup() %>% +#' filter(weekdays(time_value) == "Saturday") +#' +#' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +#' flusight_hub_formatter(cdc) +#' flusight_hub_formatter(cdc, target = "wk inc covid deaths") +#' flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths")) +#' flusight_hub_formatter(cdc, target = "wk inc covid deaths", output_type = "quantile") +flusight_hub_formatter <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + UseMethod("flusight_hub_formatter") +} + +#' @export +flusight_hub_formatter.canned_epipred <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + flusight_hub_formatter(object$predictions, ..., .fcast_period = .fcast_period) +} + +#' @export +flusight_hub_formatter.data.frame <- function( + object, ..., + .fcast_period = c("daily", "weekly")) { + required_names <- c(".pred", ".pred_distn", "forecast_date", "geo_value") + optional_names <- c("ahead", "target_date") + hardhat::validate_column_names(object, required_names) + if (!any(optional_names %in% names(object))) { + cli::cli_abort("At least one of {.val {optional_names}} must be present.") + } + + dots <- enquos(..., .named = TRUE) + names <- names(dots) + + object <- object %>% + # combine the predictions and the distribution + dplyr::mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% + dplyr::rowwise() %>% + dplyr::mutate( + .pred_distn = list(add_row(.pred_distn, q = .pred, tau = NA)), + .pred = NULL + ) %>% + tidyr::unnest(.pred_distn) %>% + # now we create the correct column names + dplyr::rename( + value = q, + output_type_id = tau, + reference_date = forecast_date + ) %>% + # convert to fips codes, and add any constant cols passed in ... + dplyr::mutate(location = abbr_to_fips(tolower(geo_value)), geo_value = NULL) + + # create target_end_date / horizon, depending on what is available + pp <- ifelse(match.arg(.fcast_period) == "daily", 1L, 7L) + has_ahead <- charmatch("ahead", names(object)) + if ("target_date" %in% names(object) && !is.na(has_ahead)) { + object <- object %>% + dplyr::rename( + target_end_date = target_date, + horizon = !!names(object)[has_ahead] + ) + } else if (!is.na(has_ahead)) { # ahead present, not target date + object <- object %>% + dplyr::rename(horizon = !!names(object)[has_ahead]) %>% + dplyr::mutate(target_end_date = horizon * pp + reference_date) + } else { # target_date present, not ahead + object <- object %>% + dplyr::rename(target_end_date = target_date) %>% + dplyr::mutate(horizon = as.integer((target_end_date - reference_date)) / pp) + } + object %>% dplyr::relocate( + reference_date, horizon, target_end_date, location, output_type_id, value + ) %>% + dplyr::mutate(!!!dots) +} diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R new file mode 100644 index 000000000..312c3630e --- /dev/null +++ b/R/layer_cdc_flatline_quantiles.R @@ -0,0 +1,280 @@ +#' CDC Flatline Forecast Quantiles +#' +#' This layer creates quantile forecasts by taking a sample from the +#' interpolated CDF of the flatline residuals, and shuffling them. +#' These are then added on to the point prediction. +#' +#' @details +#' This layer is intended to be used in concert with [flatline()]. But it can +#' also be used with anything else. As long as residuals are available in the +#' the fitted model, this layer could be useful. Like +#' [layer_residual_quantiles()] it only uses the residuals for the fitted model +#' object. However, it propagates these forward for *all* aheads, by +#' iteratively shuffling them (randomly), and then adding them to the previous +#' set. This is in contrast to what happens with the [flatline_forecaster()]. +#' When using [flatline()] as the underlying engine (here), both will result in the +#' same predictions (the most recent observed value), but that model calculates +#' separate residuals for each `ahead` by comparing to observations further into +#' the future. This version continues to use the same set of residuals, and +#' adds them on to produce wider intervals as `ahead` increases. +#' +#' +#' @inheritParams layer_residual_quantiles +#' @param aheads Numeric vector of desired forecast horizons. These should be +#' given in the "units of the training data". So, for example, for data +#' typically observed daily (possibly with missing values), but with weekly +#' forecast targets, you would use `c(7, 14, 21, 28)`. But with weekly data, +#' you would use `1:4`. +#' @param quantile_levels Numeric vector of probabilities with values in (0,1) +#' referring to the desired predictive intervals. The default is the standard +#' set for the COVID Forecast Hub. +#' @param nsims Positive integer. The number of draws from the empirical CDF. +#' These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in +#' linear interpolation on the X scale. This is achieved with +#' [stats::quantile()] Type 7 (the default for that function). +#' @param symmetrize Scalar logical. If `TRUE`, does two things: (i) forces the +#' "empirical" CDF of residuals to be symmetric by pretending that for every +#' actually-observed residual X we also observed another residual -X, and (ii) +#' at each ahead, forces the median simulated value to be equal to the point +#' prediction by adding or subtracting the same amount to every simulated +#' value. Adjustments in (ii) take place before propagating forward and +#' simulating the next ahead. This forces any 1-ahead predictive intervals to +#' be symmetric about the point prediction, and encourages larger aheads to be +#' more symmetric. +#' @param nonneg Scalar logical. Force all predictive intervals be non-negative. +#' Because non-negativity is forced _before_ propagating forward, this has +#' slightly different behaviour than would occur if using [layer_threshold()]. +#' Thresholding at each ahead takes place after any shifting from +#' `symmetrize`. +#' +#' @return an updated `frosting` postprocessor. Calling [predict()] will result +#' in an additional `` named `.pred_distn_all` containing 2-column +#' [tibble::tibble()]'s. For each +#' desired combination of `key`'s, the tibble will contain one row per ahead +#' with the associated [dist_quantiles()]. +#' @export +#' +#' @examples +#' r <- epi_recipe(case_death_rate_subset) %>% +#' # data is "daily", so we fit this to 1 ahead, the result will contain +#' # 1 day ahead residuals +#' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% +#' recipes::update_role(death_rate, new_role = "predictor") %>% +#' recipes::add_role(time_value, geo_value, new_role = "predictor") +#' +#' forecast_date <- max(case_death_rate_subset$time_value) +#' +#' latest <- get_test_data( +#' epi_recipe(case_death_rate_subset), case_death_rate_subset +#' ) +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) +#' +#' eng <- parsnip::linear_reg() %>% parsnip::set_engine("flatline") +#' +#' wf <- epi_workflow(r, eng, f) %>% fit(case_death_rate_subset) +#' preds <- suppressWarnings(predict(wf, new_data = latest)) %>% +#' dplyr::select(-time_value) %>% +#' dplyr::mutate(forecast_date = forecast_date) +#' preds +#' +#' preds <- preds %>% +#' unnest(.pred_distn_all) %>% +#' pivot_quantiles(.pred_distn) %>% +#' mutate(target_date = forecast_date + ahead) +#' +#' if (require("ggplot2")) { +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Death rate") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' } +layer_cdc_flatline_quantiles <- function( + frosting, + ..., + aheads = 1:4, + quantile_levels = c(.01, .025, 1:19 / 20, .975, .99), + nsims = 1e3, + by_key = "geo_value", + symmetrize = FALSE, + nonneg = TRUE, + id = rand_id("cdc_baseline_bands")) { + rlang::check_dots_empty() + + arg_is_int(aheads) + arg_is_probabilities(quantile_levels, allow_null = TRUE) + arg_is_pos_int(nsims) + arg_is_scalar(nsims) + arg_is_chr_scalar(id) + arg_is_lgl_scalar(symmetrize, nonneg) + arg_is_chr(by_key, allow_null = TRUE, allow_na = TRUE, allow_empty = TRUE) + + add_layer( + frosting, + layer_cdc_flatline_quantiles_new( + aheads = aheads, + quantile_levels = quantile_levels, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + nonneg = nonneg, + id = id + ) + ) +} + +layer_cdc_flatline_quantiles_new <- function( + aheads, + quantile_levels, + nsims, + by_key, + symmetrize, + nonneg, + id) { + layer( + "cdc_flatline_quantiles", + aheads = aheads, + quantile_levels = quantile_levels, + nsims = nsims, + by_key = by_key, + symmetrize = symmetrize, + nonneg = nonneg, + id = id + ) +} + +#' @export +slather.layer_cdc_flatline_quantiles <- + function(object, components, workflow, new_data, ...) { + if (is.null(object$quantile_levels)) return(components) + the_fit <- workflows::extract_fit_parsnip(workflow) + if (!inherits(the_fit, "_flatline")) { + cli::cli_warn( + c( + "Predictions for this workflow were not produced by the {.cls flatline}", + "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." + ) + ) + } + p <- components$predictions + ek <- kill_time_value(epi_keys_mold(components$mold)) + r <- grab_residuals(the_fit, components) + + avail_grps <- character(0L) + if (length(object$by_key) > 0L) { + cols_in_preds <- hardhat::check_column_names(p, object$by_key) + if (!cols_in_preds$ok) { + cli::cli_warn(c( + "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", + "Ignoring these." + )) + } + if (inherits(the_fit, "_flatline")) { + cols_in_resids <- hardhat::check_column_names(r, object$by_key) + if (!cols_in_resids$ok) { + cli::cli_warn(c( + "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", + "Ignoring these." + )) + } + # use only the keys that are in the predictions and requested. + avail_grps <- intersect(ek, setdiff( + object$by_key, + c(cols_in_preds$missing_names, cols_in_resids$missing_names) + )) + } else { # not flatline, but we'll try + key_cols <- dplyr::bind_cols( + geo_value = components$mold$extras$roles$geo_value, + components$mold$extras$roles$key + ) + cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) + if (!cols_in_resids$ok) { + cli::cli_warn(c( + "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", + "Ignoring these." + )) + } + avail_grps <- intersect(names(key_cols), setdiff( + object$by_key, + c(cols_in_preds$missing_names, cols_in_resids$missing_names) + )) + r <- dplyr::bind_cols(key_cols, r) + } + } + r <- r %>% + dplyr::select(tidyselect::all_of(c(avail_grps, ".resid"))) %>% + dplyr::group_by(!!!rlang::syms(avail_grps)) %>% + dplyr::summarise(.resid = list(.resid), .groups = "drop") + + res <- dplyr::left_join(p, r, by = avail_grps) %>% + dplyr::rowwise() %>% + dplyr::mutate( + .pred_distn_all = propagate_samples( + .resid, .pred, object$quantile_levels, + object$aheads, object$nsim, object$symmetrize, object$nonneg + ) + ) %>% + dplyr::select(tidyselect::all_of(c(avail_grps, ".pred_distn_all"))) + + # res <- check_pname(res, components$predictions, object) + components$predictions <- dplyr::left_join( + components$predictions, + res, + by = avail_grps + ) + components + } + +propagate_samples <- function( + r, p, quantile_levels, aheads, nsim, symmetrize, nonneg) { + max_ahead <- max(aheads) + if (symmetrize) { + r <- c(r, -r) + } + samp <- quantile(r, probs = c(0, seq_len(nsim - 1)) / (nsim - 1), + na.rm = TRUE, names = FALSE) + res <- list() + + raw <- samp + p + if (nonneg) raw <- pmax(0, raw) + res[[1]] <- raw + if (max_ahead > 1L) { + for (iter in 2:max_ahead) { + samp <- shuffle(samp) + raw <- raw + samp + if (symmetrize) { + symmetric <- raw - (median(raw) - p) + } else { + symmetric <- raw + } + if (nonneg) symmetric <- pmax(0, symmetric) + res[[iter]] <- symmetric + } + } + res <- res[aheads] + list(tibble::tibble( + ahead = aheads, + .pred_distn = map_vec( + res, ~ dist_quantiles(quantile(.x, quantile_levels), quantile_levels) + ) + )) +} + +shuffle <- function(x) { + stopifnot(is.vector(x)) + sample(x, length(x), replace = FALSE) +} diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index a9a8cab24..b9a71e265 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -141,8 +141,8 @@ grab_residuals <- function(the_fit, components) { if (".resid" %in% names(r)) { # success return(r) } else { # failure - rlang::warn(c( - "The `residuals()` method for objects of class {cl} results in", + cli::cli_warn(c( + "The `residuals()` method for {.cls cl} objects results in", "a data frame without a column named `.resid`.", i = "Residual quantiles will be calculated directly from the", i = "difference between predictions and observations.", @@ -152,8 +152,8 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - rlang::warn(c( - "The `residuals()` method for objects of class {cl} results in an", + cli::cli_warn(c( + "The `residuals()` method for {.cls cl} objects results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", i = "Residual quantiles will be calculated directly from the", diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index 6eab2a132..7d25a8c6b 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -21,7 +21,7 @@ #' #' @seealso [fit.model_spec()], [set_engine()] #' -#' @importFrom quantreg rq +#' @importFrom smoothqr smooth_qr #' @examples #' tib <- data.frame( #' y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), @@ -61,7 +61,8 @@ #' lines(pl$x, pl$`0.2`, col = "blue") #' lines(pl$x, pl$`0.8`, col = "blue") #' lines(pl$x, pl$`0.5`, col = "red") -#' \dontrun{ +#' +#' if (require("ggplot2")) { #' ggplot(data.frame(x = x, y = y), aes(x)) + #' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + #' geom_point(aes(y = y), colour = "grey") + # observed data diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index f6ad29a5b..74cfff284 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -42,20 +42,19 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_growth_rate <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), - log_scale = FALSE, - replace_Inf = NA, - prefix = "gr_", - columns = NULL, - skip = FALSE, - id = rand_id("growth_rate"), - additional_gr_args_list = list()) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + method = c("rel_change", "linear_reg", "smooth_spline", "trend_filter"), + log_scale = FALSE, + replace_Inf = NA, + prefix = "gr_", + columns = NULL, + skip = FALSE, + id = rand_id("growth_rate"), + additional_gr_args_list = list()) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 2482be46a..21878eaa7 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -23,16 +23,15 @@ #' recipes::prep() %>% #' recipes::bake(case_death_rate_subset) step_lag_difference <- - function( - recipe, - ..., - role = "predictor", - trained = FALSE, - horizon = 7, - prefix = "lag_diff_", - columns = NULL, - skip = FALSE, - id = rand_id("lag_diff")) { + function(recipe, + ..., + role = "predictor", + trained = FALSE, + horizon = 7, + prefix = "lag_diff_", + columns = NULL, + skip = FALSE, + id = rand_id("lag_diff")) { if (!is_epi_recipe(recipe)) { rlang::abort("This recipe step can only operate on an `epi_recipe`.") } diff --git a/R/utils-enframer.R b/R/utils-enframer.R index 387d04356..0a8152906 100644 --- a/R/utils-enframer.R +++ b/R/utils-enframer.R @@ -13,10 +13,11 @@ enframer <- function(df, x, fill = NA) { } enlist <- function(...) { - # in epiprocess - x <- list(...) - n <- as.character(sys.call())[-1] - if (!is.null(n0 <- names(x))) n[n0 != ""] <- n0[n0 != ""] - names(x) <- n - x + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 2ad03c277..89fd733e1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -34,6 +34,7 @@ reference: contents: - contains("flatline") - contains("arx") + - contains("cdc") - title: Parsnip engines desc: Prediction methods not available elsewhere contents: diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index d7d217777..4d77572a1 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -35,7 +35,9 @@ latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) # Add frosting to a workflow and predict -f <- frosting() \%>\% layer_predict() \%>\% layer_naomit(.pred) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) p1 <- predict(wf1, latest) p1 diff --git a/man/arx_fcast_epi_workflow.Rd b/man/arx_fcast_epi_workflow.Rd index fdd309959..7a6b66305 100644 --- a/man/arx_fcast_epi_workflow.Rd +++ b/man/arx_fcast_epi_workflow.Rd @@ -41,12 +41,16 @@ use \code{\link[=quantile_reg]{quantile_reg()}}) but can be omitted. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate")) +arx_fcast_epi_workflow( + jhu, "death_rate", + c("case_rate", "death_rate") +) arx_fcast_epi_workflow(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_forecaster]{arx_forecaster()}} diff --git a/man/arx_forecaster.Rd b/man/arx_forecaster.Rd index d4866aa0e..e121f272c 100644 --- a/man/arx_forecaster.Rd +++ b/man/arx_forecaster.Rd @@ -41,12 +41,16 @@ that it estimates a model for a particular target horizon. jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value >= as.Date("2021-12-01")) -out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate")) +out <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") +) out <- arx_forecaster(jhu, "death_rate", - c("case_rate", "death_rate"), trainer = quantile_reg(), - args_list = arx_args_list(levels = 1:9 / 10)) + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list(levels = 1:9 / 10) +) } \seealso{ \code{\link[=arx_fcast_epi_workflow]{arx_fcast_epi_workflow()}}, \code{\link[=arx_args_list]{arx_args_list()}} diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd new file mode 100644 index 000000000..2f6546f74 --- /dev/null +++ b/man/cdc_baseline_args_list.Rd @@ -0,0 +1,85 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cdc_baseline_forecaster.R +\name{cdc_baseline_args_list} +\alias{cdc_baseline_args_list} +\title{CDC baseline forecaster argument constructor} +\usage{ +cdc_baseline_args_list( + data_frequency = "1 week", + aheads = 1:4, + n_training = Inf, + forecast_date = NULL, + quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + nsims = 1000L, + symmetrize = TRUE, + nonneg = TRUE, + quantile_by_key = "geo_value", + nafill_buffer = Inf +) +} +\arguments{ +\item{data_frequency}{Integer or string. This describes the frequency of the +input \code{epi_df}. For typical FluSight forecasts, this would be \code{"1 week"}. +Allowable arguments are integers (taken to mean numbers of days) or a +string like \code{"7 days"} or \code{"2 weeks"}. Currently, all other periods +(other than days or weeks) result in an error.} + +\item{aheads}{Integer vector. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have +any effect on the predicted values. +Predictions are always the most recent observation. This determines the +set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_cdc_flatline_quantiles()}}\verb{. It interacts with the }data_frequency\verb{argument. So, for example, if the data is daily and you want forecasts for 1:4 days ahead, then you would use}1:4\verb{. However, if you want one-week predictions, you would set this as }c(7, 14, 21, 28)\verb{. But if }data_frequency\code{is}"1 week"\verb{, then you would set it as }1:4`.} + +\item{n_training}{Integer. An upper limit for the number of rows per +key that are used for training +(in the time unit of the \code{epi_df}).} + +\item{forecast_date}{Date. The date on which the forecast is created. +The default \code{NULL} will attempt to determine this automatically.} + +\item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce +prediction intervals. These are created by computing the quantiles of +training residuals. A \code{NULL} value will result in point forecasts only.} + +\item{nsims}{Positive integer. The number of draws from the empirical CDF. +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting +in linear interpolation on the X scale. This is achieved with +\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} + +\item{symmetrize}{Logical. The default \code{TRUE} calculates +symmetric prediction intervals. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{nonneg}{Logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this +has slightly different behaviour than would occur if using +\code{\link[=layer_threshold]{layer_threshold()}}.} + +\item{quantile_by_key}{Character vector. Groups residuals by listed keys +before calculating residual quantiles. See the \code{by_key} argument to +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} for more information. The default, +\code{character(0)} performs no grouping. This argument only applies when +residual quantiles are used. It is not applicable with +\code{trainer = quantile_reg()}, for example.} + +\item{nafill_buffer}{At predict time, recent values of the training data +are used to create a forecast. However, these can be \code{NA} due to, e.g., +data latency issues. By default, any missing values will get filled with +less recent data. Setting this value to \code{NULL} will result in 1 extra +recent row (beyond those required for lag creation) to be used. Note that +we require at least \code{min(lags)} rows of recent data per \code{geo_value} to +create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} +will be treated as \emph{additional} allowed recent data rather than the +total amount of recent data to examine.} +} +\value{ +A list containing updated parameter choices with class \code{cdc_flat_fcast}. +} +\description{ +Constructs a list of arguments for \code{\link[=cdc_baseline_forecaster]{cdc_baseline_forecaster()}}. +} +\examples{ +cdc_baseline_args_list() +cdc_baseline_args_list(symmetrize = FALSE) +cdc_baseline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) +} diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd new file mode 100644 index 000000000..3f5faa329 --- /dev/null +++ b/man/cdc_baseline_forecaster.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cdc_baseline_forecaster.R +\name{cdc_baseline_forecaster} +\alias{cdc_baseline_forecaster} +\title{Predict the future with the most recent value} +\usage{ +cdc_baseline_forecaster( + epi_data, + outcome, + args_list = cdc_baseline_args_list() +) +} +\arguments{ +\item{epi_data}{An \code{\link[epiprocess:epi_df]{epiprocess::epi_df}}} + +\item{outcome}{A scalar character for the column name we wish to predict.} + +\item{args_list}{A list of additional arguments as created by the +\code{\link[=cdc_baseline_args_list]{cdc_baseline_args_list()}} constructor function.} +} +\value{ +A data frame of point and interval forecasts for all aheads (unique +horizons) for each unique combination of \code{key_vars}. +} +\description{ +This is a simple forecasting model for +\link[epiprocess:epi_df]{epiprocess::epi_df} data. It uses the most recent observation as the +forecast for any future date, and produces intervals by shuffling the quantiles +of the residuals of such a "flatline" forecast and incrementing these +forward over all available training data. +} +\details{ +By default, the predictive intervals are computed separately for each +combination of \code{geo_value} in the \code{epi_data} argument. + +This forecaster is meant to produce exactly the CDC Baseline used for +\href{https://covid19forecasthub.org}{COVID19ForecastHub} +} +\examples{ +library(dplyr) +weekly_deaths <- case_death_rate_subset \%>\% + select(geo_value, time_value, death_rate) \%>\% + left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% + select(-pop, -death_rate) \%>\% + group_by(geo_value) \%>\% + epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% + ungroup() \%>\% + filter(weekdays(time_value) == "Saturday") + +cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +preds <- pivot_quantiles(cdc$predictions, .pred_distn) + +if (require(ggplot2)) { +forecast_date <- unique(preds$forecast_date) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = deaths) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Weekly deaths") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) +} +} diff --git a/man/create_layer.Rd b/man/create_layer.Rd index 399d62efa..d36385fb2 100644 --- a/man/create_layer.Rd +++ b/man/create_layer.Rd @@ -20,9 +20,9 @@ fill in the name of the layer, and open the file. \examples{ \dontrun{ - # Note: running this will write `layer_strawberry.R` to - # the `R/` directory of your current project - create_layer("strawberry") +# Note: running this will write `layer_strawberry.R` to +# the `R/` directory of your current project +create_layer("strawberry") } } diff --git a/man/dist_quantiles.Rd b/man/dist_quantiles.Rd index 50f00dc32..739bae5a8 100644 --- a/man/dist_quantiles.Rd +++ b/man/dist_quantiles.Rd @@ -15,7 +15,7 @@ dist_quantiles(x, tau) A distribution parameterized by a set of quantiles } \examples{ -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) quantile(dstn, p = c(.1, .25, .5, .9)) median(dstn) diff --git a/man/extrapolate_quantiles.Rd b/man/extrapolate_quantiles.Rd index 985d7cae8..cc6cb2c3c 100644 --- a/man/extrapolate_quantiles.Rd +++ b/man/extrapolate_quantiles.Rd @@ -24,12 +24,14 @@ library(distributional) dstn <- dist_normal(c(10, 2), c(5, 10)) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8))) +dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) # because this distribution is already quantiles, any extra quantiles are # appended extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) -dstn <- c(dist_normal(c(10, 2), c(5, 10)), - dist_quantiles(list(1:4, 8:11), list(c(.2,.4,.6,.8)))) +dstn <- c( + dist_normal(c(10, 2), c(5, 10)), + dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) +) extrapolate_quantiles(dstn, p = c(.25, 0.5, .75)) } diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index fb1c3af28..3dfa0029a 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -29,7 +29,7 @@ preprocessing the data and fitting the underlying parsnip model. } \examples{ jhu <- case_death_rate_subset \%>\% -filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) + filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/flatline.Rd b/man/flatline.Rd index a396cfeb9..c353ff163 100644 --- a/man/flatline.Rd +++ b/man/flatline.Rd @@ -38,8 +38,10 @@ This is an internal function that is used to create a \code{\link[parsnip:linear model. It has somewhat odd behaviour (see below). } \examples{ -tib <- data.frame(y = runif(100), - expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5)) \%>\% +tib <- data.frame( + y = runif(100), + expand.grid(k = letters[1:4], j = letters[5:9], time_value = 1:5) +) \%>\% dplyr::group_by(k, j) \%>\% dplyr::mutate(y2 = dplyr::lead(y, 2)) # predict 2 steps ahead flat <- flatline(y2 ~ j + k + y, tib) # predictions for 20 locations diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 55d93c1db..dcae448f1 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -17,8 +17,12 @@ flatline_args_list( ) } \arguments{ -\item{ahead}{Integer. Number of time steps ahead (in days) of the forecast -date for which forecasts should be produced.} +\item{ahead}{Integer. Unlike \code{\link[=arx_forecaster]{arx_forecaster()}}, this doesn't have any effect +on the predicted values. Predictions are always the most recent observation. +However, this \emph{does} impact the residuals stored in the object. Residuals +are calculated based on this number to mimic how badly you would have done. +So for example, \code{ahead = 7} will create residuals by comparing values +7 days apart.} \item{n_training}{Integer. An upper limit for the number of rows per key that are used for training diff --git a/man/flusight_hub_formatter.Rd b/man/flusight_hub_formatter.Rd new file mode 100644 index 000000000..d8a4571f4 --- /dev/null +++ b/man/flusight_hub_formatter.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flusight_hub_formatter.R +\name{flusight_hub_formatter} +\alias{flusight_hub_formatter} +\title{Format predictions for submission to FluSight forecast Hub} +\usage{ +flusight_hub_formatter(object, ..., .fcast_period = c("daily", "weekly")) +} +\arguments{ +\item{object}{a data.frame of predictions or an object of class +\code{canned_epipred} as created by, e.g., \code{\link[=arx_forecaster]{arx_forecaster()}}} + +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name = value pairs of constant +columns (or mutations) to perform to the results. See examples.} + +\item{.fcast_period}{Control whether the \code{horizon} should represent days or +weeks. Depending on whether the forecaster output has target dates +from \code{\link[=layer_add_target_date]{layer_add_target_date()}} or not, we may need to compute the horizon +and/or the \code{target_end_date} from the other available columns in the predictions. +When both \code{ahead} and \code{target_date} are available, this is ignored. If only +\code{ahead} or \code{aheads} exists, then the target date may need to be multiplied +if the \code{ahead} represents weekly forecasts. Alternatively, if only, the +\code{target_date} is available, then the \code{horizon} will be in days, unless +this argument is \code{"weekly"}. Note that these can be adjusted later by the +\code{...} argument.} +} +\value{ +A \link[tibble:tibble]{tibble::tibble}. If \code{...} is empty, the result will contain the +columns \code{reference_date}, \code{horizon}, \code{target_end_date}, \code{location}, +\code{output_type_id}, and \code{value}. The \code{...} can perform mutations on any of +these. +} +\description{ +This function converts predictions from any of the included forecasters into +a format (nearly) ready for submission to the 2023-24 +\href{https://github.com/cdcepi/FluSight-forecast-hub}{FluSight-forecast-hub}. +See there for documentation of the required columns. Currently, only +"quantile" forcasts are supported, but the intention is to support both +"quantile" and "pmf". For this reason, adding the \code{output_type} column should +be done via the \code{...} argument. See the examples below. The specific required +format for this forecast task is \href{https://github.com/cdcepi/FluSight-forecast-hub/blob/main/model-output/README.md}{here}. +} +\examples{ +library(dplyr) +weekly_deaths <- case_death_rate_subset \%>\% + select(geo_value, time_value, death_rate) \%>\% + left_join(state_census \%>\% select(pop, abbr), by = c("geo_value" = "abbr")) \%>\% + mutate(deaths = pmax(death_rate / 1e5 * pop * 7, 0)) \%>\% + select(-pop, -death_rate) \%>\% + group_by(geo_value) \%>\% + epi_slide(~ sum(.$deaths), before = 6, new_col_name = "deaths") \%>\% + ungroup() \%>\% + filter(weekdays(time_value) == "Saturday") + +cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") +flusight_hub_formatter(cdc) +flusight_hub_formatter(cdc, target = "wk inc covid deaths") +flusight_hub_formatter(cdc, target = paste(horizon, "wk inc covid deaths")) +flusight_hub_formatter(cdc, target = "wk inc covid deaths", output_type = "quantile") +} diff --git a/man/frosting.Rd b/man/frosting.Rd index 83a8d6a9d..362c40a4f 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -24,8 +24,8 @@ The arguments are currently placeholders and must be NULL \examples{ # Toy example to show that frosting can be created and added for postprocessing - f <- frosting() - wf <- epi_workflow() \%>\% add_frosting(f) +f <- frosting() +wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example jhu <- case_death_rate_subset \%>\% diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index 421978eb5..4e173d662 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -46,15 +46,17 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) - # Don't specify `forecast_date` (by default, this should be last date in latest) -f <- frosting() \%>\% layer_predict() \%>\% - layer_naomit(.pred) +# Don't specify `forecast_date` (by default, this should be last date in latest) +f <- frosting() \%>\% + layer_predict() \%>\% + layer_naomit(.pred) wf0 <- wf \%>\% add_frosting(f) p0 <- predict(wf0, latest) p0 # Specify a `forecast_date` that is greater than or equal to `as_of` date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_naomit(.pred) wf1 <- wf \%>\% add_frosting(f) @@ -73,7 +75,7 @@ p2 <- predict(wf2, latest) p2 # Do not specify a forecast_date - f3 <- frosting() \%>\% +f3 <- frosting() \%>\% layer_predict() \%>\% layer_add_forecast_date() \%>\% layer_naomit(.pred) diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index 58ff7770f..3c2884e10 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -48,7 +48,8 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- get_test_data(r, jhu) # Use ahead + forecast date -f <- frosting() \%>\% layer_predict() \%>\% +f <- frosting() \%>\% + layer_predict() \%>\% layer_add_forecast_date(forecast_date = "2022-05-31") \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) @@ -59,7 +60,8 @@ p # Use ahead + max time value from pre, fit, post # which is the same if include `layer_add_forecast_date()` -f2 <- frosting() \%>\% layer_predict() \%>\% +f2 <- frosting() \%>\% + layer_predict() \%>\% layer_add_target_date() \%>\% layer_naomit(.pred) wf2 <- wf \%>\% add_frosting(f2) diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd new file mode 100644 index 000000000..55a1a378e --- /dev/null +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_cdc_flatline_quantiles.R +\name{layer_cdc_flatline_quantiles} +\alias{layer_cdc_flatline_quantiles} +\title{CDC Flatline Forecast Quantiles} +\usage{ +layer_cdc_flatline_quantiles( + frosting, + ..., + aheads = 1:4, + quantile_levels = c(0.01, 0.025, 1:19/20, 0.975, 0.99), + nsims = 1000, + by_key = "geo_value", + symmetrize = FALSE, + nonneg = TRUE, + id = rand_id("cdc_baseline_bands") +) +} +\arguments{ +\item{frosting}{a \code{frosting} postprocessor} + +\item{...}{Unused, include for consistency with other layers.} + +\item{aheads}{Numeric vector of desired forecast horizons. These should be +given in the "units of the training data". So, for example, for data +typically observed daily (possibly with missing values), but with weekly +forecast targets, you would use \code{c(7, 14, 21, 28)}. But with weekly data, +you would use \code{1:4}.} + +\item{quantile_levels}{Numeric vector of probabilities with values in (0,1) +referring to the desired predictive intervals. The default is the standard +set for the COVID Forecast Hub.} + +\item{nsims}{Positive integer. The number of draws from the empirical CDF. +These samples are spaced evenly on the (0, 1) scale, F_X(x) resulting in +linear interpolation on the X scale. This is achieved with +\code{\link[stats:quantile]{stats::quantile()}} Type 7 (the default for that function).} + +\item{by_key}{A character vector of keys to group the residuals by before +calculating quantiles. The default, \code{c()} performs no grouping.} + +\item{symmetrize}{Scalar logical. If \code{TRUE}, does two things: (i) forces the +"empirical" CDF of residuals to be symmetric by pretending that for every +actually-observed residual X we also observed another residual -X, and (ii) +at each ahead, forces the median simulated value to be equal to the point +prediction by adding or subtracting the same amount to every simulated +value. Adjustments in (ii) take place before propagating forward and +simulating the next ahead. This forces any 1-ahead predictive intervals to +be symmetric about the point prediction, and encourages larger aheads to be +more symmetric.} + +\item{nonneg}{Scalar logical. Force all predictive intervals be non-negative. +Because non-negativity is forced \emph{before} propagating forward, this has +slightly different behaviour than would occur if using \code{\link[=layer_threshold]{layer_threshold()}}. +Thresholding at each ahead takes place after any shifting from +\code{symmetrize}.} + +\item{id}{a random id string} +} +\value{ +an updated \code{frosting} postprocessor. Calling \code{\link[=predict]{predict()}} will result +in an additional \verb{} named \code{.pred_distn_all} containing 2-column +\code{\link[tibble:tibble]{tibble::tibble()}}'s. For each +desired combination of \code{key}'s, the tibble will contain one row per ahead +with the associated \code{\link[=dist_quantiles]{dist_quantiles()}}. +} +\description{ +This layer creates quantile forecasts by taking a sample from the +interpolated CDF of the flatline residuals, and shuffling them. +These are then added on to the point prediction. +} +\details{ +This layer is intended to be used in concert with \code{\link[=flatline]{flatline()}}. But it can +also be used with anything else. As long as residuals are available in the +the fitted model, this layer could be useful. Like +\code{\link[=layer_residual_quantiles]{layer_residual_quantiles()}} it only uses the residuals for the fitted model +object. However, it propagates these forward for \emph{all} aheads, by +iteratively shuffling them (randomly), and then adding them to the previous +set. This is in contrast to what happens with the \code{\link[=flatline_forecaster]{flatline_forecaster()}}. +When using \code{\link[=flatline]{flatline()}} as the underlying engine (here), both will result in the +same predictions (the most recent observed value), but that model calculates +separate residuals for each \code{ahead} by comparing to observations further into +the future. This version continues to use the same set of residuals, and +adds them on to produce wider intervals as \code{ahead} increases. +} +\examples{ +r <- epi_recipe(case_death_rate_subset) \%>\% + # data is "daily", so we fit this to 1 ahead, the result will contain + # 1 day ahead residuals + step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% + recipes::update_role(death_rate, new_role = "predictor") \%>\% + recipes::add_role(time_value, geo_value, new_role = "predictor") + +forecast_date <- max(case_death_rate_subset$time_value) + +latest <- get_test_data( + epi_recipe(case_death_rate_subset), case_death_rate_subset +) + +f <- frosting() \%>\% + layer_predict() \%>\% + layer_cdc_flatline_quantiles(aheads = c(7, 14, 21, 28), symmetrize = TRUE) + +eng <- parsnip::linear_reg() \%>\% parsnip::set_engine("flatline") + +wf <- epi_workflow(r, eng, f) \%>\% fit(case_death_rate_subset) +preds <- suppressWarnings(predict(wf, new_data = latest)) \%>\% + dplyr::select(-time_value) \%>\% + dplyr::mutate(forecast_date = forecast_date) +preds + +preds <- preds \%>\% + unnest(.pred_distn_all) \%>\% + pivot_quantiles(.pred_distn) \%>\% + mutate(target_date = forecast_date + ahead) + +if (require("ggplot2")) { +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Death rate") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) +} +} diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index e841e9a50..179d6862c 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -78,13 +78,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -93,9 +95,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -104,9 +108,12 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, x = epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% - dplyr::select(geo_value, time_value, cases)) + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% + dplyr::select(geo_value, time_value, cases) +) predict(wf, latest) } diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 1326dfe75..03473053f 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -62,9 +62,9 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) r <- epi_recipe(jhu) \%>\% - step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_naomit() + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) diff --git a/man/nested_quantiles.Rd b/man/nested_quantiles.Rd index 1a2824041..c4b578c1a 100644 --- a/man/nested_quantiles.Rd +++ b/man/nested_quantiles.Rd @@ -16,8 +16,8 @@ a list-col Turn a vector of quantile distributions into a list-col } \examples{ -edf <- case_death_rate_subset[1:3,] -edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5/6, 2:4/5, 3:10/11)) +edf <- case_death_rate_subset[1:3, ] +edf$q <- dist_quantiles(list(1:5, 2:4, 3:10), list(1:5 / 6, 2:4 / 5, 3:10 / 11)) edf_nested <- edf \%>\% dplyr::mutate(q = nested_quantiles(q)) edf_nested \%>\% tidyr::unnest(q) diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 293999876..b938541f1 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -39,9 +39,10 @@ only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}} tib <- data.frame( y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), - x1 = rnorm(100), x2 = rnorm(100)) + x1 = rnorm(100), x2 = rnorm(100) +) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 1:6) -ff <- qr_spec \%>\% fit(cbind(y1, y2 , y3 , y4 , y5 , y6) ~ ., data = tib) +ff <- qr_spec \%>\% fit(cbind(y1, y2, y3, y4, y5, y6) ~ ., data = tib) p <- predict(ff, new_data = tib) x <- -99:99 / 100 * 2 * pi @@ -50,28 +51,31 @@ fd <- x[length(x) - 20] XY <- smoothqr::lagmat(y[1:(length(y) - 20)], c(-20:20)) XY <- tibble::as_tibble(XY) qr_spec <- smooth_quantile_reg(tau = c(.2, .5, .8), outcome_locations = 20:1) -tt <- qr_spec \%>\% fit_xy(x = XY[,21:41], y = XY[,1:20]) +tt <- qr_spec \%>\% fit_xy(x = XY[, 21:41], y = XY[, 1:20]) library(tidyr) library(dplyr) pl <- predict( - object = tt, - new_data = XY[max(which(complete.cases(XY[,21:41]))), 21:41] - ) + object = tt, + new_data = XY[max(which(complete.cases(XY[, 21:41]))), 21:41] +) pl <- pl \%>\% - unnest(.pred) \%>\% - mutate(distn = nested_quantiles(distn)) \%>\% - unnest(distn) \%>\% - mutate(x = x[length(x) - 20] + ahead / 100 * 2 * pi, - ahead = NULL) \%>\% - pivot_wider(names_from = tau, values_from = q) + unnest(.pred) \%>\% + mutate(distn = nested_quantiles(distn)) \%>\% + unnest(distn) \%>\% + mutate( + x = x[length(x) - 20] + ahead / 100 * 2 * pi, + ahead = NULL + ) \%>\% + pivot_wider(names_from = tau, values_from = q) plot(x, y, pch = 16, xlim = c(pi, 2 * pi), col = "lightgrey") curve(sin(x), add = TRUE) abline(v = fd, lty = 2) lines(pl$x, pl$`0.2`, col = "blue") lines(pl$x, pl$`0.8`, col = "blue") lines(pl$x, pl$`0.5`, col = "red") -\dontrun{ + +if (require("ggplot2")) { ggplot(data.frame(x = x, y = y), aes(x)) + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + geom_point(aes(y = y), colour = "grey") + # observed data diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index ca8609b1e..bf135346e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -90,7 +90,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \examples{ r <- epi_recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - step_epi_lag(death_rate, lag = c(0,7,14)) + step_epi_lag(death_rate, lag = c(0, 7, 14)) r } \seealso{ diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index 0449b887c..b409135b1 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -87,7 +87,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_growth_rate(case_rate, death_rate) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index d69c25faa..b06abe43c 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -59,7 +59,9 @@ r <- epi_recipe(case_death_rate_subset) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) r -r \%>\% recipes::prep() \%>\% recipes::bake(case_death_rate_subset) +r \%>\% + recipes::prep() \%>\% + recipes::bake(case_death_rate_subset) } \seealso{ Other row operation steps: diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 2964c6912..1a9564563 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -104,13 +104,15 @@ jhu <- epiprocess::jhu_csse_daily_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% dplyr::select(geo_value, time_value, cases) -pop_data = data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) r <- epi_recipe(jhu) \%>\% - step_population_scaling(df = pop_data, - df_pop_col = "value", - by = c("geo_value" = "states"), - cases, suffix = "_scaled") \%>\% + step_population_scaling( + df = pop_data, + df_pop_col = "value", + by = c("geo_value" = "states"), + cases, suffix = "_scaled" + ) \%>\% step_epi_lag(cases_scaled, lag = c(0, 7, 14)) \%>\% step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") \%>\% step_epi_naomit() @@ -119,9 +121,11 @@ f <- frosting() \%>\% layer_predict() \%>\% layer_threshold(.pred) \%>\% layer_naomit(.pred) \%>\% - layer_population_scaling(.pred, df = pop_data, - by = c("geo_value" = "states"), - df_pop_col = "value") + layer_population_scaling(.pred, + df = pop_data, + by = c("geo_value" = "states"), + df_pop_col = "value" + ) wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) \%>\% @@ -130,8 +134,10 @@ wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% latest <- get_test_data( recipe = r, epiprocess::jhu_csse_daily_subset \%>\% - dplyr::filter(time_value > "2021-11-01", - geo_value \%in\% c("ca", "ny")) \%>\% + dplyr::filter( + time_value > "2021-11-01", + geo_value \%in\% c("ca", "ny") + ) \%>\% dplyr::select(geo_value, time_value, cases) ) diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 7861f27ea..ce7c0fc74 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -50,9 +50,12 @@ after any filtering step. tib <- tibble::tibble( x = 1:10, y = 1:10, - time_value = rep(seq(as.Date("2020-01-01"), by = 1, - length.out = 5), times = 2), - geo_value = rep(c("ca", "hi"), each = 5)) \%>\% + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, + length.out = 5 + ), times = 2), + geo_value = rep(c("ca", "hi"), each = 5) +) \%>\% as_epi_df() epi_recipe(y ~ x, data = tib) \%>\% diff --git a/tests/testthat/test-parse_period.R b/tests/testthat/test-parse_period.R new file mode 100644 index 000000000..0adbcec3d --- /dev/null +++ b/tests/testthat/test-parse_period.R @@ -0,0 +1,12 @@ +test_that("parse_period works", { + expect_error(parse_period(c(1, 2))) + expect_error(parse_period(c(1.3))) + expect_error(parse_period("1 year")) + expect_error(parse_period("2 weeks later")) + expect_identical(parse_period(1), 1L) + expect_identical(parse_period("1 day"), 1L) + expect_identical(parse_period("1 days"), 1L) + expect_identical(parse_period("1 week"), 7L) + expect_identical(parse_period("1 weeks"), 7L) + expect_identical(parse_period("2 weeks"), 14L) +}) diff --git a/tests/testthat/test-propagate_samples.R b/tests/testthat/test-propagate_samples.R new file mode 100644 index 000000000..5278ab385 --- /dev/null +++ b/tests/testthat/test-propagate_samples.R @@ -0,0 +1,7 @@ +test_that("propagate_samples", { + r <- -30:50 + p <- 40 + quantiles <- 1:9 / 10 + aheads <- c(2, 4, 7) + nsim <- 100 +}) diff --git a/tests/testthat/test-shuffle.R b/tests/testthat/test-shuffle.R new file mode 100644 index 000000000..94bc1aa3b --- /dev/null +++ b/tests/testthat/test-shuffle.R @@ -0,0 +1,5 @@ +test_that("shuffle works", { + expect_error(shuffle(matrix(NA, 2, 2))) + expect_length(shuffle(1:10), 10L) + expect_identical(sort(shuffle(1:10)), 1:10) +})