From dff767efb88069f5795310cc9b3be789a251a98f Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Thu, 7 Mar 2024 15:27:44 -0800 Subject: [PATCH 01/25] add period utils --- NAMESPACE | 1 + R/period.R | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 R/period.R diff --git a/NAMESPACE b/NAMESPACE index 3c63145b3..89a522d66 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ S3method(fit,epi_workflow) S3method(flusight_hub_formatter,canned_epipred) S3method(flusight_hub_formatter,data.frame) S3method(format,dist_quantiles) +S3method(format,period) S3method(is.na,dist_quantiles) S3method(is.na,distribution) S3method(mean,dist_quantiles) diff --git a/R/period.R b/R/period.R new file mode 100644 index 000000000..af34a5896 --- /dev/null +++ b/R/period.R @@ -0,0 +1,41 @@ +new_period <- function(x) { + arg_is_scalar(x) + arg_is_chr(x) + + n <- as.integer(strextract("^[0-9]+", x)) + names_in <- tolower(strextract("[a-zA-Z]+$", x)) + names_allowed <- paste0(rlang::fn_fmls_names(default_period), "s") + if (length(n) == 0L || length(names_in) == 0L || + is.na(pmatch(names_in, names_allowed))) { + cli_abort(c( + "Requested periodicity {.var {names_in}} is not available.", + i = "Input must be a positive integer followed by one of {.val {names_allowed}}." + )) + } + names_in <- gsub("s$", "", names_in) + l <- rlang::list2(!!names_in := n) + res <- eval(rlang::call2("default_period", !!!l)) + vctrs::new_rcrd(res, class = "period") +} + +default_period <- function(year = 0L, quarter = 0L, month = 0L, epiweek = 0L, + week = 0L, day = 0L, hours = 0L, minutes = 0L, + seconds = 0L) { + enlist( + year = year, month = month + 3L * quarter, epiweek = epiweek, + day = day + 7L * week, hours = hours, minutes = minutes, seconds = seconds + ) +} + +#' @method format period +#' @export +format.period <- function(x, ...) { + nms <- c("Y", "M", "EW", "D", "h", "m", "s") + val <- vctrs::vec_c(!!!vctrs::vec_data(x)) + paste0(val[val != 0], nms[val != 0]) +} + +strextract <- function(pattern, x) { + m <- regexec(pattern, x) + unlist(regmatches(x, m)) +} From 06678456e80906644b21365ee12753318d0007e4 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 8 Mar 2024 14:35:55 -0800 Subject: [PATCH 02/25] ensure joins happen silently --- R/layer_population_scaling.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 2b7057bef..750ced0ee 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -144,6 +144,12 @@ slather.layer_population_scaling <- length(object$df_pop_col) == 1 ) + if (is.null(object$by)) { + object$by <- intersect( + kill_time_value(epi_keys(components$predictions)), + colnames(dplyr::select(object$df, !object$df_pop_col)) + ) + } try_join <- try( dplyr::left_join(components$predictions, object$df, by = object$by From edd5134144c9a6c0b7d93e3bc615c2c77d20eda6 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 07:40:54 -0800 Subject: [PATCH 03/25] fix time_type processing in forecast/target date --- R/epi_workflow.R | 7 +++-- R/layer_add_forecast_date.R | 27 ++++++++++--------- R/layer_add_target_date.R | 52 +++++++++++++++++++++++------------- R/layer_population_scaling.R | 5 ++-- R/period.R | 41 ---------------------------- R/step_epi_shift.R | 21 ++++++++------- R/time_types.R | 46 +++++++++++++++++++++++++++++++ 7 files changed, 110 insertions(+), 89 deletions(-) delete mode 100644 R/period.R create mode 100644 R/time_types.R diff --git a/R/epi_workflow.R b/R/epi_workflow.R index d5e7d13a2..0fca5dc6c 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -251,11 +251,10 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' preds predict.epi_workflow <- function(object, new_data, ...) { if (!workflows::is_trained_workflow(object)) { - rlang::abort( - c("Can't predict on an untrained epi_workflow.", + cli::cli_abort(c( + "Can't predict on an untrained epi_workflow.", i = "Do you need to call `fit()`?" - ) - ) + )) } components <- list() components$mold <- workflows::extract_mold(object) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 6bb2cf572..ea3635d79 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -78,8 +78,8 @@ layer_add_forecast_date <- } layer_add_forecast_date_new <- function(forecast_date, id) { - forecast_date <- arg_to_date(forecast_date, allow_null = TRUE) arg_is_chr_scalar(id) + # can't validate forecast_date until we know the time_type layer("add_forecast_date", forecast_date = forecast_date, id = id) } @@ -93,23 +93,24 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da ) object$forecast_date <- max_time_value } - as_of_pre <- attributes(workflows::extract_preprocessor(workflow)$template)$metadata$as_of - as_of_fit <- workflow$fit$meta$as_of - as_of_post <- attributes(new_data)$metadata$as_of - as_of_date <- as.Date(max(as_of_pre, as_of_fit, as_of_post)) + expected_time_type <- attr( + workflows::extract_preprocessor(workflow)$template, "metadata" + )$time_type + if (expected_time_type == "week") expected_time_type = "day" + check <- validate_date(object$forecast_date, expected_time_type) - if (object$forecast_date < as_of_date) { - cli_warn( - c("The forecast_date is less than the most ", - "recent update date of the data: ", - i = "forecast_date = {object$forecast_date} while data is from {as_of_date}." - ) - ) + if (!check$ok) { + cli::cli_abort(c( + "The `forecast_date` was given as a {.val {check$x}} while the", + `!` = "`time_type` of the training data was {.val {check$expected}}.", + i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." + )) } + components$predictions <- dplyr::bind_cols( components$predictions, - forecast_date = as.Date(object$forecast_date) + forecast_date = coerce_time_type(object$forecast_date, expected_time_type) ) components } diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index f2fee889f..6a48cc923 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -63,7 +63,7 @@ #' p3 layer_add_target_date <- function(frosting, target_date = NULL, id = rand_id("add_target_date")) { - target_date <- arg_to_date(target_date, allow_null = TRUE) + # can't validate target_date until we know the time_type arg_is_chr_scalar(id) add_layer( frosting, @@ -84,33 +84,47 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data the_recipe <- workflows::extract_recipe(workflow) the_frosting <- extract_frosting(workflow) - if (!is.null(object$target_date)) { - target_date <- as.Date(object$target_date) - } else { # null target date case - if (detect_layer(the_frosting, "layer_add_forecast_date") && - !is.null(extract_argument( - the_frosting, - "layer_add_forecast_date", "forecast_date" - ))) { - forecast_date <- extract_argument( - the_frosting, - "layer_add_forecast_date", "forecast_date" - ) + expected_time_type <- attr( + workflows::extract_preprocessor(workflow)$template, "metadata" + )$time_type + if (expected_time_type == "week") expected_time_type = "day" - ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") + browser() + if (!is.null(object$target_date)) { + check <- validate_date(object$target_date, expected_time_type) + if (!check$ok) { + cli::cli_abort(c( + "The `target_date` was given as a {.val {check$x}} while the", + `!` = "`time_type` of the training data was {.val {check$expected}}.", + i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." + )) + } + target_date <- coerce_time_type(object$target_date, expected_time_type) + } else if ( + detect_layer(the_frosting, "layer_add_forecast_date") && + !is.null(possible_fd <- extract_argument( + the_frosting, "layer_add_forecast_date", "forecast_date" + ))) { - target_date <- forecast_date + ahead - } else { + check <- validate_date(possible_fd, expected_time_type) + if (!check$ok) { + cli::cli_abort(c( + "The `forecast_date` was given as a {.val {check$x}} while the", + `!` = "`time_type` of the training data was {.val {check$expected}}.", + i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." + )) + } + forecast_date <- coerce_time_type(fd, expected_time_type) + ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") + target_date <- forecast_date + ahead + } else { max_time_value <- max( workflows::extract_preprocessor(workflow)$max_time_value, workflow$fit$meta$max_time_value, max(new_data$time_value) ) - ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") - target_date <- max_time_value + ahead - } } components$predictions <- dplyr::bind_cols(components$predictions, diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 750ced0ee..1cdc35d7c 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -144,6 +144,7 @@ slather.layer_population_scaling <- length(object$df_pop_col) == 1 ) + browser() if (is.null(object$by)) { object$by <- intersect( kill_time_value(epi_keys(components$predictions)), @@ -163,8 +164,8 @@ slather.layer_population_scaling <- )) } - object$df <- object$df %>% - dplyr::mutate(dplyr::across(tidyselect::where(is.character), tolower)) + # object$df <- object$df %>% + # dplyr::mutate(dplyr::across(tidyselect::where(is.character), tolower)) pop_col <- rlang::sym(object$df_pop_col) exprs <- rlang::expr(c(!!!object$terms)) pos <- tidyselect::eval_select(exprs, components$predictions) diff --git a/R/period.R b/R/period.R deleted file mode 100644 index af34a5896..000000000 --- a/R/period.R +++ /dev/null @@ -1,41 +0,0 @@ -new_period <- function(x) { - arg_is_scalar(x) - arg_is_chr(x) - - n <- as.integer(strextract("^[0-9]+", x)) - names_in <- tolower(strextract("[a-zA-Z]+$", x)) - names_allowed <- paste0(rlang::fn_fmls_names(default_period), "s") - if (length(n) == 0L || length(names_in) == 0L || - is.na(pmatch(names_in, names_allowed))) { - cli_abort(c( - "Requested periodicity {.var {names_in}} is not available.", - i = "Input must be a positive integer followed by one of {.val {names_allowed}}." - )) - } - names_in <- gsub("s$", "", names_in) - l <- rlang::list2(!!names_in := n) - res <- eval(rlang::call2("default_period", !!!l)) - vctrs::new_rcrd(res, class = "period") -} - -default_period <- function(year = 0L, quarter = 0L, month = 0L, epiweek = 0L, - week = 0L, day = 0L, hours = 0L, minutes = 0L, - seconds = 0L) { - enlist( - year = year, month = month + 3L * quarter, epiweek = epiweek, - day = day + 7L * week, hours = hours, minutes = minutes, seconds = seconds - ) -} - -#' @method format period -#' @export -format.period <- function(x, ...) { - nms <- c("Y", "M", "EW", "D", "h", "m", "s") - val <- vctrs::vec_c(!!!vctrs::vec_data(x)) - paste0(val[val != 0], nms[val != 0]) -} - -strextract <- function(pattern, x) { - m <- regexec(pattern, x) - unlist(regmatches(x, m)) -} diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index ec5428d8f..94ef7178c 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -53,20 +53,20 @@ step_epi_lag <- function(recipe, ..., + lag, role = "predictor", trained = FALSE, - lag, prefix = "lag_", default = NA, columns = NULL, skip = FALSE, id = rand_id("epi_lag")) { if (!is_epi_recipe(recipe)) { - rlang::abort("This recipe step can only operate on an `epi_recipe`.") + cli::cli_abort("This step can only operate on an `epi_recipe`.") } if (missing(lag)) { - rlang::abort( + cli::cli_abort( c("The `lag` argument must not be empty.", i = "Did you perhaps pass an integer in `...` accidentally?" ) @@ -75,7 +75,8 @@ step_epi_lag <- arg_is_nonneg_int(lag) arg_is_chr_scalar(prefix, id) if (!is.null(columns)) { - rlang::abort(c("The `columns` argument must be `NULL.", + cli::cli_abort(c( + "The `columns` argument must be `NULL.", i = "Use `tidyselect` methods to choose columns to lag." )) } @@ -85,7 +86,7 @@ step_epi_lag <- terms = dplyr::enquos(...), role = role, trained = trained, - lag = lag, + lag = as.integer(lag), prefix = prefix, default = default, keys = epi_keys(recipe), @@ -104,21 +105,21 @@ step_epi_lag <- step_epi_ahead <- function(recipe, ..., + ahead, role = "outcome", trained = FALSE, - ahead, prefix = "ahead_", default = NA, columns = NULL, skip = FALSE, id = rand_id("epi_ahead")) { if (!is_epi_recipe(recipe)) { - rlang::abort("This recipe step can only operate on an `epi_recipe`.") + cli::cli_abort("This step can only operate on an `epi_recipe`.") } if (missing(ahead)) { - rlang::abort( - c("The `ahead` argument must not be empty.", + cli::cli_abort(c( + "The `ahead` argument must not be empty.", i = "Did you perhaps pass an integer in `...` accidentally?" ) ) @@ -136,7 +137,7 @@ step_epi_ahead <- terms = dplyr::enquos(...), role = role, trained = trained, - ahead = ahead, + ahead = as.integer(ahead), prefix = prefix, default = default, keys = epi_keys(recipe), diff --git a/R/time_types.R b/R/time_types.R new file mode 100644 index 000000000..02852696d --- /dev/null +++ b/R/time_types.R @@ -0,0 +1,46 @@ +guess_time_type <- function(time_value) { + if (is.character(time_value)) { + if (nchar(time_value[1]) <= "10") { + new_time_value <- tryCatch({ + as.Date(time_value) + }, error = function(e) NULL) + } else { + new_time_value <- tryCatch({ + as.POSIXct(time_value) + }, error = function(e) NULL) + } + if (!is.null(new_time_value)) time_value <- new_time_value + } + if (inherits(time_value, "POSIXct")) return("day-time") + if (inherits(time_value, "Date")) return("day") + if (inherits(time_value, "yearweek")) return("yearweek") + if (inherits(time_value, "yearmonth")) return("yearmonth") + if (inherits(time_value, "yearquarter")) return("yearquarter") + if (is.numeric(time_value) && all(time_value == as.integer(time_value)) && + all(time_value >= 1582)) { + return("year") + } + return("custom") +} + +coerce_time_type <- function(x, target_type) { + if (target_type == "year") { + if (is.numeric(x)) return(as.integer(x)) + else return(as.POSIXlt(x)$year + 1900L) + } + switch( + target_type, + "day-time" = as.POSIXct(x), + "day" = as.Date(x), + "week" = as.Date(x), + "yearweek" = tsibble::yearweek(x), + "yearmonth" = tsibble::yearmonth(x), + "yearquarter" = tsibble::yearquarter(x) + ) +} + +validate_date <- function(x, expected) { + x <- guess_time_type(x) + ok <- x == expected + enlist(ok, x, expected) +} From e3b2907db94784dc8636561bfdb543b9856d5384 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 07:41:05 -0800 Subject: [PATCH 04/25] import tsibble --- DESCRIPTION | 1 + NAMESPACE | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e068fe7e3..68a566cee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,6 +46,7 @@ Imports: tibble, tidyr, tidyselect, + tsibble, usethis, vctrs, workflows (>= 1.0.0) diff --git a/NAMESPACE b/NAMESPACE index 89a522d66..3c63145b3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,7 +43,6 @@ S3method(fit,epi_workflow) S3method(flusight_hub_formatter,canned_epipred) S3method(flusight_hub_formatter,data.frame) S3method(format,dist_quantiles) -S3method(format,period) S3method(is.na,dist_quantiles) S3method(is.na,distribution) S3method(mean,dist_quantiles) From a624b63244e71b2f45336ff47944760476be5239 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 07:41:26 -0800 Subject: [PATCH 05/25] use panel_data vignette for experiments, to be removed --- vignettes/panel-data.Rmd | 545 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 545 insertions(+) create mode 100644 vignettes/panel-data.Rmd diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd new file mode 100644 index 000000000..39321ae4c --- /dev/null +++ b/vignettes/panel-data.Rmd @@ -0,0 +1,545 @@ +--- +title: "Using epipredict on non-epidemic panel data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Using epipredict on non-epidemic panel data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include=F} +knitr::opts_chunk$set( + echo = TRUE, + collapse = TRUE, + comment = "#>", + out.width = "100%" +) +``` + +```{r libraries} +library(dplyr) +library(tidyr) +library(parsnip) +library(recipes) +library(epiprocess) +library(epipredict) +library(ggplot2) +theme_set(theme_bw()) +``` + +[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, +contain cross-sectional measurements of subjects over time. The `epipredict` +package is most suitable for running forecasters on epidemiological panel data. +A built-in example of this is the [`case_death_rate_subset`]( + https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html) +dataset, which contains daily state-wise measures of `case_rate` and +`death_rate` for COVID-19 in 2021: + +```{r epi-panel-ex, include=T} +head(case_death_rate_subset, 3) +``` + +`epipredict` functions work with data in +[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html) +format. Despite the stated goal and name of the package, other panel datasets +are also valid candidates for `epipredict` functionality, as long as they are +in `epi_df` format. + +```{r employ-stats, include=F} +data("grad_employ_subset") +year_start <- min(grad_employ_subset$time_value) +year_end <- max(grad_employ_subset$time_value) +``` + +# Example panel data overview + +In this vignette, we will demonstrate using `epipredict` with employment panel +data from Statistics Canada. We will be using +[ + Table 37-10-0115-01: Characteristics and median employment income of + longitudinal cohorts of postsecondary graduates two and five years after + graduation, by educational qualification and field of study (primary + groupings) +](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501). + +The full dataset contains yearly median employment income two and five years +after graduation, and number of graduates. The data is stratified by +variables such as geographic region (Canadian province), education, and +age group. The year range of the dataset is `r year_start` to `r year_end`, +inclusive. The full dataset also contains metadata that describes the +quality of data collected. For demonstration purposes, we make the following +modifications to get a subset of the full dataset: + +* Only keep provincial-level geographic region (the full data also has +"Canada" as a region) +* Only keep "good" or better quality data rows, as indicated by the [`STATUS`]( + https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column +* Choose a subset of covariates and aggregate across the remaining ones. The +chosen covariates are age group, and educational qualification. + +Below is the query for obtaining the full data and code for subsetting it as we +just described: + +```{r employ-query, eval=F} +library(cansim) + +# Get statcan data using get_cansim, which returns a tibble +statcan_grad_employ <- get_cansim("37-10-0115-01") + +gemploy <- statcan_grad_employ %>% + # Drop some columns and rename the ones we keep + select(c( + "REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification", + "Field of study", "Gender", "Age group", "Status of student in Canada", + "Characteristics after graduation", "Graduate statistics" + )) %>% + rename( + "geo_value" = "GEO", + "time_value" = "REF_DATE", + "value" = "VALUE", + "status" = "STATUS", + "edu_qual" = "Educational qualification", + "fos" = "Field of study", + "gender" = "Gender", + "age_group" = "Age group", + "student_status" = "Status of student in Canada", + "grad_charac" = "Characteristics after graduation", + "grad_stat" = "Graduate statistics" + ) %>% + # The original `VALUE` column contain the statistic indicated by + # `Graduate statistics` in the original data. Below we pivot the data + # wider so that each unique statistic can have its own column. + mutate( + # Recode for easier pivoting + grad_stat = recode_factor( + grad_stat, + `Number of graduates` = "num_graduates", + `Median employment income two years after graduation` = "med_income_2y", + `Median employment income five years after graduation` = "med_income_5y" + ), + # They are originally strings but want ints for conversion to epi_df later + time_value = as.integer(time_value) + ) %>% + pivot_wider(names_from = grad_stat, values_from = value) %>% + filter( + # Drop aggregates for some columns + geo_value != "Canada" & + age_group != "15 to 64 years" & + edu_qual != "Total, educational qualification" & + # Keep aggregates for keys we don't want to keep + fos == "Total, field of study" & + gender == "Total, gender" & + student_status == "Canadian and international students" & + # Since we're looking at 2y and 5y employment income, the only + # characteristics remaining are: + # - Graduates reporting employment income + # - Graduates reporting wages, salaries, and commissions only + # For simplicity, keep the first one only + grad_charac == "Graduates reporting employment income" & + # Only keep "good" data + is.na(status) & + # Drop NA value rows + !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y) + ) %>% + select(-c(status, gender, student_status, grad_charac, fos)) +``` + +To use this data with `epipredict`, we need to convert it into `epi_df` format +using [`as_epi_df`]( + https://cmu-delphi.github.io/epiprocess/reference/as_epi_df.html) +with additional keys. In our case, the additional keys are `age_group`, +and `edu_qual`. Note that in the above modifications, we encoded `time_value` +as type `integer`. This lets us set `time_type = "year"`, and ensures that +lag and ahead modifications later on are using the correct time units. See the +[`epi_df` documentation]( + https://cmu-delphi.github.io/epiprocess/reference/epi_df.html#time-types) for +a list of all the `type_type`s available. + +```{r convert-to-epidf, eval=F} +grad_employ_subset <- gemploy %>% + tsibble::as_tsibble( + index = time_value, + key = c(geo_value, age_group, edu_qual) + ) %>% + as_epi_df( + geo_type = "custom", time_type = "year", + additional_metadata = c(other_keys = list("age_group", "edu_qual")) + ) +``` + +```{r data-dim, include=F} +employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",") +employ_colcount <- length(names(grad_employ_subset)) +``` + +Now, we are ready to use `grad_employ_subset` with `epipredict`. +Our `epi_df` contains `r employ_rowcount` rows and `r employ_colcount` columns. +Here is a quick summary of the columns in our `epi_df`: + +* `time_value` (time value): year in `date` format +* `geo_value` (geo value): province in Canada +* `num_graduates` (raw, time series value): number of graduates +* `med_income_2y` (raw, time series value): median employment income 2 years +after graduation +* `med_income_5y` (raw, time series value): median employment income 5 years +after graduation +* `age_group` (key): one of two age groups, either 15 to 34 years, or 35 to 64 +years +* `edu_qual` (key): one of 32 unique educational qualifications, e.g., +"Master's diploma" + +```{r preview-data, include=T} +# Rename for simplicity +employ <- grad_employ_subset +sample_n(employ, 6) +``` + +In the following sections, we will go over preprocessing the data in the +`epi_recipe` framework, and fitting a model and making predictions within the +`epipredict` framework and using the package's canned forecasters. + +# Simple autoregressive with 3 lags to predict number of graduates in a year + +## Preprocessing + +As a simple example, let's work with the `num_graduates` column for now. We will +first pre-process by "standardizing" each numeric column by the total within +each group of keys. We do this since those raw numeric values will vary greatly +from province to province since there are large differences in population. + +```{r employ-small, include=T} +employ_small <- employ %>% + group_by(geo_value, age_group, edu_qual) %>% + # Select groups where there are complete timeseries values + filter(n() >= 6) %>% + mutate( + num_graduates_prop = num_graduates / sum(num_graduates), + med_income_2y_prop = med_income_2y / sum(med_income_2y), + med_income_5y_prop = med_income_5y / sum(med_income_5y) + ) %>% + ungroup() +head(employ_small) +``` + +Below is a visualization for a sample of the small data. Note that some groups +do not have any time series information since we filtered out all timeseries +with incomplete dates. + +```{r employ-small-graph, include=T, eval=T, fig.width=9, fig.height=6} +employ_small %>% + filter(geo_value %in% c("British Columbia", "Ontario")) %>% + filter(grepl("degree", edu_qual, fixed = T)) %>% + group_by(geo_value, time_value, edu_qual, age_group) %>% + summarise(num_graduates_prop = sum(num_graduates_prop), .groups = "drop") %>% + ggplot(aes(x = time_value, y = num_graduates_prop, color = geo_value)) + + geom_line() + + scale_colour_manual(values = c("Cornflowerblue", "Orange"), name = "") + + facet_grid(rows = vars(edu_qual), cols = vars(age_group)) + + xlab("Year") + + ylab("Percentage of gratuates") + + theme(legend.position = "bottom") +``` + +We will predict the "standardized" number of graduates (a proportion) in the +next year (time $t+1$) using an autoregressive model with three lags (i.e., an +AR(3) model). Such a model is represented algebraically like this: + +\[ + x_{t+1} = + \alpha_0 + \alpha_1 x_{t} + \alpha_2 x_{t-1} + \alpha_3 x_{t-2} + \epsilon_t +\] + +where $x_i$ is the proportion of graduates at time $i$, and the current time is +$t$. + +In the preprocessing step, we need to create additional columns in `employ` for +each of $x_{t+1}$, $x_{t}$, $x_{t-1}$, and $x_{t-2}$. We do this via an +`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these +outcome and predictor columns; the recipe just stores the instructions for +adding them. + +Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and +3 `lag` columns representing $x_{t}$, $x_{t-1}$, and $x_{t-2}$. Also note that +since we specified our `time_type` to be `year`, our `lag` and `lead` +values are both in years. + +```{r make-recipe, include=T, eval=T} +r <- epi_recipe(employ_small) %>% + step_epi_ahead(num_graduates_prop, ahead = 1) %>% + step_epi_lag(num_graduates_prop, lag = 0:2) %>% + step_epi_naomit() +r +``` + +Let's apply this recipe using `prep` and `bake` to generate and view the `lag` +and `ahead` columns. + +```{r view-preprocessed, include=T} +# Display a sample of the preprocessed data +bake_and_show_sample <- function(recipe, data, n = 5) { + recipe %>% prep(data) %>% bake(new_data = data) %>% sample_n(n) +} + +r %>% bake_and_show_sample(employ_small) +``` + +We can see that the `prep` and `bake` steps created new columns according to +our `epi_recipe`: + +- `ahead_1_num_graduates_prop` corresponds to $x_{t+1}$ +- `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and +`lag_2_num_graduates_prop` correspond to $x_{t}$, $x_{t-1}$, and $x_{t-2}$ +respectively. + +## Model fitting and prediction + +Since our goal for now is to fit a simple autoregressive model, we can use +[`parsnip::linear_reg()`]( + https://parsnip.tidymodels.org/reference/linear_reg.html) with the default +engine `lm`, which fits a linear regression using ordinary least squares. + +We will use `epi_workflow` with the `epi_recipe` we defined in the +preprocessing section along with the `parsnip::linear_reg()` model. Note again +that `epi_workflow` is a container and doesn't actually do the fitting. We have +to pass the workflow into `fit()` to get our model coefficients +$\alpha_i, i=0,...,3$. + +```{r linearreg-wf, include=T} +wf_linreg <- epi_workflow(r, linear_reg()) %>% + fit(employ_small) +summary(extract_fit_engine(wf_linreg)) +``` + +This output tells us the coefficients of the fitted model; for instance, +the intercept is $\alpha_0 = 0.24804$ and the coefficient for $x_{t}$ is +$\alpha_1 = 0.06648$. The summary also tells us that only the intercept and lags +at 2 years and 3 years ago have coefficients significantly greater than zero. + +Extracting the 95% confidence intervals for the coefficients also leads us to +the same conclusion: all the coefficients except for $\alpha_1$ (lag 0) contain +0. + +```{r} +confint(extract_fit_engine(wf_linreg)) +``` + +Now that we have our workflow, we can generate predictions from a subset of our +data. For this demo, we will predict the number of graduates using the last 2 +years of our dataset. + +```{r linearreg-predict, include=T} +latest <- get_test_data(recipe = r, x = employ_small) +preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred)) +# Display a sample of the prediction values, excluding NAs +preds %>% sample_n(5) +``` + +We can do this using the `augment` function too. Note that `predict` and +`augment` both still return an `epi_df` with all of the keys that were present +in the original dataset. + +```{r linearreg-augment, include=T, eval=F} +augment(wf_linreg, latest) %>% sample_n(5) +``` + +## Model diagnostics + +First, we'll plot the residuals (that is, $y_{t} - \hat{y}_{t}$) against the +fitted values ($\hat{y}_{t}$). + +```{r lienarreg-resid-plot, include=T, fig.height=8} +par(mfrow = c(2, 2), mar = c(5, 3.5, 1, 1) + .5) +plot(extract_fit_engine(wf_linreg)) +``` + +The fitted values vs. residuals plot shows us that the residuals are mostly +clustered around zero, but do not form an even band around the zero line, +indicating that the variance of the residuals is not constant. Additionally, +the fitted values vs. square root of standardized residuals makes this more +obvious - the spread of the square root of standardized residuals varies with +the fitted values. + +The Q-Q plot shows us that the residuals have heavier tails than a Normal +distribution. So the normality of residuals assumption doesn't hold either. + +Finally, the residuals vs. leverage plot shows us that we have a few influential +points based on the Cooks distance (those outside the red dotted line). + +Since we appear to be violating the linear model assumptions, we might consider +transforming our data differently, or considering a non-linear model, or +something else. + +# Autoregressive model with exogenous inputs + +Now suppose we want to model the 5-year employment income using 3 lags, while +also incorporating information from the other two time-series in our dataset: +the 2-year employment income and the number of graduates in the previous 2 +years. We would do this using an autoregressive model with exogenous inputs, +defined as follows: + +\[ + y_{t+1} = + \alpha_0 + \alpha_1 y_{t} + \alpha_2 y_{t-1} + \alpha_3 y_{t-2} + + \beta_1 x_{t} + \beta_2 x_{t-1} + \gamma_2 z_{t} + \gamma_2 z_{t-1} +\] + +where $y_i$ is the 5-year median income (proportion) at time $i$, +$x_i$ is the 2-year median income (proportion) at time $i$, and +$z_i$ is the number of graduates (proportion) at time $i$. + +## Preprocessing + +Again, we construct an `epi_recipe` detailing the preprocessing steps. + +```{r custom-arx, include=T} +rx <- epi_recipe(employ_small) %>% + step_epi_ahead(med_income_5y_prop, ahead = 1) %>% + # 5-year median income has 3 lags c(0,1,2) + step_epi_lag(med_income_5y_prop, lag = c(0, 1, 2)) %>% + # But the two exogenous variables have 2 lags c(0,1) + step_epi_lag(med_income_2y_prop, lag = c(0, 1)) %>% + step_epi_lag(num_graduates_prop, lag = c(0, 1)) %>% + step_epi_naomit() + +bake_and_show_sample(rx, employ_small) +``` + +## Model fitting & postprocessing + +Before fitting our model and making predictions, let's add some post-processing +steps using a few [`frosting`]( + https://cmu-delphi.github.io/epipredict/reference/frosting.html) layers to do +a few things: + +1. Convert our predictions back to income values and number of graduates, +rather than standardized proportions. We do this via the frosting layer +`layer_population_scaling`. +2. Threshold our predictions to 0. We are predicting proportions, which can't +be negative. And the transformed values back to dollars and people can't be +negative either. +3. Generate prediction intervals based on residual quantiles, allowing us to +quantify the uncertainty associated with future predicted values. + +```{r custom-arx-post, include=T} +# Create dataframe of the sums we used for standardizing +# Only have to include med_income_5y since that is our outcome +totals <- employ_small %>% + group_by(geo_value, age_group, edu_qual) %>% + summarise(med_income_5y_tot = sum(med_income_5y), .groups = "drop") + +# Define post-processing steps +f <- frosting() %>% + layer_predict() %>% + layer_naomit(.pred) %>% + layer_threshold(.pred, lower = 0) %>% + # 90% prediction interval + layer_residual_quantiles( + quantile_levels = c(0.1, 0.9), + symmetrize = FALSE + ) %>% + layer_population_scaling( + .pred, .pred_distn, + df = totals, df_pop_col = "med_income_5y_tot" + ) + +wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>% + fit(employ_small) %>% + add_frosting(f) + +summary(extract_fit_engine(wfx_linreg)) +``` + +Based on the summary output for this model, only the intercept term, 5-year +median income from 3 years ago, and the 2-year median income from 1 year ago +are significant linear predictors for today's 5-year median income at a 95% +confidence level. Both lags for the number of graduates were insignificant. + +Let's take a look at the predictions along with their 90% prediction intervals. + +```{r} +latest <- get_test_data(recipe = rx, x = employ_small) +predsx <- predict(wfx_linreg, latest) + +# Display values within prediction intervals +predsx %>% + select( + geo_value, time_value, edu_qual, age_group, fos, + .pred_scaled, .pred_distn_scaled + ) %>% + head() %>% + pivot_quantiles_wider(.pred_distn_scaled) +``` + +# Using canned forecasters + +We've seen what we can do with non-epidemiological panel data using the +recipes frame, with `epi_recipe` for preprocessing, `epi_workflow` for model +fitting, and `frosting` for postprocessing. + +`epipredict` also comes with canned forecasters that do all of those steps +behind the scenes for some simple models. Even though we aren't working with +epidemiological data, canned forecasters still work as expected, out of the box. +We will demonstrate this with the simple +[`flatline_forecaster`]( + https://cmu-delphi.github.io/epipredict/reference/flatline_forecaster.html) +and the direct autoregressive (AR) forecaster +[`arx_forecaster`]( + https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html). + +For both illustrations, we will continue to use the `employ_small` dataset +with the transformed numeric columns that are proportions within each group +by the keys in our `epi_df`. + +## Flatline forecaster + +In this first example, we'll use `flatline_forecaster` to make a simple +prediction of the 2-year median income for the next year, based on one previous +time point. This model is representated algebraically as: +\[y_{t+1} = \alpha_0 + \alpha_0 y_{t}\] +where $y_i$ is the 2-year median income (proportion) at time $i$. + +```{r flatline, include=T, warning=F} +out_fl <- flatline_forecaster(employ_small, "med_income_2y_prop", + args_list = flatline_args_list(ahead = 1) +) + +out_fl +``` + +## Autoregressive forecaster with exogenous inputs + +In this second example, we'll use `arx_forecaster` to make a prediction of the +5-year median income based using two lags, _and_ using two lags on two exogenous +variables: 2-year median income and number of graduates. + +The canned forecaster gives us a simple way of making this forecast since it +defines the recipe, workflow, and post-processing steps behind the scenes. This +is very similar to the model we introduced in the "Autoregressive Linear Model +with Exogenous Inputs" section of this article, but where all inputs have the +same number of lags. + +```{r arx-lr, include=T, warning=F} +arx_args <- arx_args_list(lags = c(0L, 1L), ahead = 1L) + +out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop", + c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"), + args_list = arx_args +) + +out_arx_lr +``` + +Other changes to the direct AR forecaster, like changing the engine, also work +as expected. Below we use a boosted tree model instead of a linear regression. + +```{r arx-rf, include=T, warning=F} +out_arx_rf <- arx_forecaster( + employ_small, "med_income_5y_prop", + c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"), + trainer = parsnip::boost_tree(mode = "regression", trees = 20), + args_list = arx_args +) + +out_arx_rf +``` From 69b21c67cb8b891b9377b0da1936e5eefa8205ed Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:01:35 -0800 Subject: [PATCH 06/25] remove browser() --- R/layer_add_target_date.R | 3 +-- R/layer_population_scaling.R | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 6a48cc923..8ab077efa 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -89,7 +89,6 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data )$time_type if (expected_time_type == "week") expected_time_type = "day" - browser() if (!is.null(object$target_date)) { check <- validate_date(object$target_date, expected_time_type) if (!check$ok) { @@ -114,7 +113,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." )) } - forecast_date <- coerce_time_type(fd, expected_time_type) + forecast_date <- coerce_time_type(possible_fd, expected_time_type) ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") target_date <- forecast_date + ahead } else { diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 1cdc35d7c..f89160794 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -144,7 +144,6 @@ slather.layer_population_scaling <- length(object$df_pop_col) == 1 ) - browser() if (is.null(object$by)) { object$by <- intersect( kill_time_value(epi_keys(components$predictions)), From 3498fcfdc614ccca36bdc8ebe7c5f0f4f9988c6e Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:01:54 -0800 Subject: [PATCH 07/25] prefix with utils to avoid check warning --- R/dist_quantiles.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R index 7f7af40f4..f20ebe3dc 100644 --- a/R/dist_quantiles.R +++ b/R/dist_quantiles.R @@ -233,10 +233,10 @@ quantile_extrapolate <- function(x, tau_out, middle) { dplyr::arrange(q) } if (any(indl)) { - qvals_out[indl] <- tail_extrapolate(tau_out[indl], head(qv, 2)) + qvals_out[indl] <- tail_extrapolate(tau_out[indl], utils::head(qv, 2)) } if (any(indr)) { - qvals_out[indr] <- tail_extrapolate(tau_out[indr], tail(qv, 2)) + qvals_out[indr] <- tail_extrapolate(tau_out[indr], utils::tail(qv, 2)) } qvals_out } From b2b8134d033d7af825cef30e5743c33acee93ebe Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:02:05 -0800 Subject: [PATCH 08/25] redocument --- man/step_epi_shift.Rd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index bf135346e..f4419b831 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -8,9 +8,9 @@ step_epi_lag( recipe, ..., + lag, role = "predictor", trained = FALSE, - lag, prefix = "lag_", default = NA, columns = NULL, @@ -21,9 +21,9 @@ step_epi_lag( step_epi_ahead( recipe, ..., + ahead, role = "outcome", trained = FALSE, - ahead, prefix = "ahead_", default = NA, columns = NULL, @@ -38,16 +38,16 @@ sequence of operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} +\item{lag, ahead}{A vector of integers. Each specified column will +be the lag or lead for each value in the vector. Lag integers must be +nonnegative, while ahead integers must be positive.} + \item{role}{For model terms created by this step, what analysis role should they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{lag, ahead}{A vector of integers. Each specified column will -be the lag or lead for each value in the vector. Lag integers must be -nonnegative, while ahead integers must be positive.} - \item{prefix}{A prefix to indicate what type of variable this is} \item{default}{Determines what fills empty rows From f2f39a264e5fbf96b9b7df7f7bdaba870c270147 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:02:39 -0800 Subject: [PATCH 09/25] ahead/lag will always be integer type now --- tests/testthat/test-extract_argument.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-extract_argument.R b/tests/testthat/test-extract_argument.R index 0654304ba..3250b2991 100644 --- a/tests/testthat/test-extract_argument.R +++ b/tests/testthat/test-extract_argument.R @@ -43,19 +43,19 @@ test_that("recipe argument extractor works", { expect_error(extract_argument(r$steps[[1]], "uhoh", "bubble")) expect_error(extract_argument(r$steps[[1]], "step_epi_lag", "bubble")) - expect_identical(extract_argument(r$steps[[2]], "step_epi_ahead", "ahead"), 7) + expect_identical(extract_argument(r$steps[[2]], "step_epi_ahead", "ahead"), 7L) expect_error(extract_argument(r, "step_lightly", "quantile_levels")) expect_identical( extract_argument(r, "step_epi_lag", "lag"), - list(c(0, 7, 14), c(0, 7, 14)) + list(c(0L, 7L, 14L), c(0L, 7L, 14L)) ) wf <- epi_workflow(preprocessor = r) expect_error(extract_argument(epi_workflow(), "step_epi_lag", "lag")) expect_identical( extract_argument(wf, "step_epi_lag", "lag"), - list(c(0, 7, 14), c(0, 7, 14)) + list(c(0L, 7L, 14L), c(0L, 7L, 14L)) ) }) From f9859fe880a0209ab7d3f1c8c454c0a9c2b3f1df Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:16:37 -0800 Subject: [PATCH 10/25] we no longer warn if as_of > forecast_date --- tests/testthat/test-layer_add_forecast_date.R | 25 +++++++++++-------- tests/testthat/test-layer_add_target_date.R | 3 ++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 1830118dc..29a427c3a 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -11,8 +11,9 @@ latest <- jhu %>% test_that("layer validation works", { f <- frosting() - expect_error(layer_add_forecast_date(f, "a")) - expect_error(layer_add_forecast_date(f, "2022-05-31", id = c("a", "b"))) + + + expect_error(layer_add_forecast_date(f, "2022-05-31", id = 2)) expect_silent(layer_add_forecast_date(f, "2022-05-31")) expect_silent(layer_add_forecast_date(f)) expect_silent(layer_add_forecast_date(f, as.Date("2022-05-31"))) @@ -41,10 +42,12 @@ test_that("Specify a `forecast_date` that is less than `as_of` date", { layer_naomit(.pred) wf2 <- wf %>% add_frosting(f2) - expect_warning( - p2 <- predict(wf2, latest), - "forecast_date is less than the most recent update date of the data." - ) + # this warning has been removed + # expect_warning( + # p2 <- predict(wf2, latest), + # "forecast_date is less than the most recent update date of the data." + # ) + expect_silent(p2 <- predict(wf2, latest)) expect_equal(ncol(p2), 4L) expect_s3_class(p2, "epi_df") expect_equal(nrow(p2), 3L) @@ -59,10 +62,12 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { layer_naomit(.pred) wf3 <- wf %>% add_frosting(f3) - expect_warning( - p3 <- predict(wf3, latest), - "forecast_date is less than the most recent update date of the data." - ) + # this warning has been removed + # expect_warning( + # p3 <- predict(wf3, latest), + # "forecast_date is less than the most recent update date of the data." + # ) + expect_silent(p3 <- predict(wf3, latest)) expect_equal(ncol(p3), 4L) expect_s3_class(p3, "epi_df") expect_equal(nrow(p3), 3L) diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 287956612..bf752e968 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -31,7 +31,8 @@ test_that("Use ahead + max time value from pre, fit, post", { layer_naomit(.pred) wf2 <- wf %>% add_frosting(f2) - expect_warning(p2 <- predict(wf2, latest)) + # expect_warning(p2 <- predict(wf2, latest)) # this warning has been removed + expect_silent(p2 <- predict(wf2, latest)) expect_equal(ncol(p2), 5L) expect_s3_class(p2, "epi_df") expect_equal(nrow(p2), 3L) From fb7faae256731bcf51a351d0a3d268289c4df0ed Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:17:01 -0800 Subject: [PATCH 11/25] upstream changes to prep() no require data --- tests/testthat/test-step_growth_rate.R | 8 ++++---- tests/testthat/test-step_lag_difference.R | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-step_growth_rate.R b/tests/testthat/test-step_growth_rate.R index d0dec170e..052141710 100644 --- a/tests/testthat/test-step_growth_rate.R +++ b/tests/testthat/test-step_growth_rate.R @@ -34,7 +34,7 @@ test_that("step_growth_rate works for a single signal", { res <- r %>% step_growth_rate(value, horizon = 1) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$gr_1_rel_change_value, c(NA, 1 / 6:9)) @@ -46,7 +46,7 @@ test_that("step_growth_rate works for a single signal", { r <- epi_recipe(edf) res <- r %>% step_growth_rate(value, horizon = 1) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$gr_1_rel_change_value, rep(c(NA, 1 / 6:9), each = 2)) }) @@ -63,7 +63,7 @@ test_that("step_growth_rate works for a two signals", { res <- r %>% step_growth_rate(v1, v2, horizon = 1) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$gr_1_rel_change_v1, c(NA, 1 / 6:9)) expect_equal(res$gr_1_rel_change_v2, c(NA, 1 / 1:4)) @@ -76,7 +76,7 @@ test_that("step_growth_rate works for a two signals", { r <- epi_recipe(edf) res <- r %>% step_growth_rate(v1, v2, horizon = 1) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$gr_1_rel_change_v1, rep(c(NA, 1 / 6:9), each = 2)) expect_equal(res$gr_1_rel_change_v2, rep(c(NA, 1 / 1:4), each = 2)) diff --git a/tests/testthat/test-step_lag_difference.R b/tests/testthat/test-step_lag_difference.R index dc61d12d4..c0fd377e6 100644 --- a/tests/testthat/test-step_lag_difference.R +++ b/tests/testthat/test-step_lag_difference.R @@ -27,13 +27,13 @@ test_that("step_lag_difference works for a single signal", { res <- r %>% step_lag_difference(value, horizon = 1) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$lag_diff_1_value, c(NA, rep(1, 4))) res <- r %>% step_lag_difference(value, horizon = 1:2) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$lag_diff_1_value, c(NA, rep(1, 4))) expect_equal(res$lag_diff_2_value, c(NA, NA, rep(2, 3))) @@ -48,7 +48,7 @@ test_that("step_lag_difference works for a single signal", { r <- epi_recipe(edf) res <- r %>% step_lag_difference(value, horizon = 1) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$lag_diff_1_value, c(NA, NA, rep(1, 8))) }) @@ -65,7 +65,7 @@ test_that("step_lag_difference works for a two signals", { res <- r %>% step_lag_difference(v1, v2, horizon = 1:2) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$lag_diff_1_v1, c(NA, rep(1, 4))) expect_equal(res$lag_diff_2_v1, c(NA, NA, rep(2, 3))) @@ -80,7 +80,7 @@ test_that("step_lag_difference works for a two signals", { r <- epi_recipe(edf) res <- r %>% step_lag_difference(v1, v2, horizon = 1:2) %>% - prep() %>% + prep(edf) %>% bake(edf) expect_equal(res$lag_diff_1_v1, rep(c(NA, rep(1, 4)), each = 2)) expect_equal(res$lag_diff_2_v1, rep(c(NA, NA, rep(2, 3)), each = 2)) From 78ff1f94b341758f24a80e3e236f0504e19418ab Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sat, 9 Mar 2024 08:19:05 -0800 Subject: [PATCH 12/25] remove unused test --- tests/testthat/test-propagate_samples.R | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 tests/testthat/test-propagate_samples.R diff --git a/tests/testthat/test-propagate_samples.R b/tests/testthat/test-propagate_samples.R deleted file mode 100644 index 5278ab385..000000000 --- a/tests/testthat/test-propagate_samples.R +++ /dev/null @@ -1,7 +0,0 @@ -test_that("propagate_samples", { - r <- -30:50 - p <- 40 - quantiles <- 1:9 / 10 - aheads <- c(2, 4, 7) - nsim <- 100 -}) From 3d059f42951db9f7a1e114b67d915712eda93046 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 18 Mar 2024 11:32:19 -0700 Subject: [PATCH 13/25] always a scalar --- R/time_types.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/time_types.R b/R/time_types.R index 02852696d..5f3aa4c14 100644 --- a/R/time_types.R +++ b/R/time_types.R @@ -1,6 +1,8 @@ guess_time_type <- function(time_value) { + # similar to epiprocess:::guess_time_type() but w/o the gap handling + arg_is_scalar(time_value) if (is.character(time_value)) { - if (nchar(time_value[1]) <= "10") { + if (nchar(time_value) <= "10") { new_time_value <- tryCatch({ as.Date(time_value) }, error = function(e) NULL) From de9e00a8163a9fc8028879d6c10b3bd37c408328 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 18 Mar 2024 11:36:26 -0700 Subject: [PATCH 14/25] add some additional tests --- tests/testthat/test-layer_add_forecast_date.R | 32 +++++++++++++++++++ tests/testthat/test-layer_add_target_date.R | 32 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 29a427c3a..93f13baf7 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -74,3 +74,35 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { expect_equal(p3$forecast_date, rep(as.Date("2021-12-31"), times = 3)) expect_named(p3, c("geo_value", "time_value", ".pred", "forecast_date")) }) + + +test_that("forecast date works for daily", { + f <- frosting() %>% + layer_predict() %>% + layer_add_forecast_date() %>% + layer_naomit(.pred) + + wf1 <- add_frosting(wf, f) + p <- predict(wf1, latest) + expect_identical(p$forecast_date[1], as.Date("2021-12-31")) + + latest_bad <- latest %>% + unclass() %>% + as.data.frame() %>% + mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>% + as_epi_df() + + expect_error(predict(wf1, latest_bad)) + wf1 <- add_frosting( + wf, + adjust_frosting(f, "layer_add_forecast_date", forecast_date = "2022-01-01") + ) + expect_silent(predict(wf1, latest)) + + wf1 <- add_frosting( + wf, + adjust_frosting(f, "layer_add_forecast_date", forecast_date = 2022L) + ) + expect_error(predict(wf1, latest)) # wrong time type of forecast_date + +}) diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index bf752e968..f3b706aa6 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -86,3 +86,35 @@ test_that("Specify own target date", { expect_equal(p2$target_date, rep(as.Date("2022-01-08"), times = 3)) expect_named(p2, c("geo_value", "time_value", ".pred", "target_date")) }) + +test_that("forecast date works for daily", { + f <- frosting() %>% + layer_predict() %>% + layer_add_target_date() %>% + layer_naomit(.pred) + + wf1 <- add_frosting(wf, f) + p <- predict(wf1, latest) + expect_identical(p$target_date[1], as.Date("2021-12-31") + 7L) + + latest_bad <- latest %>% + unclass() %>% + as.data.frame() %>% + mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>% + as_epi_df() + + expect_error(predict(wf1, latest_bad)) + wf1 <- add_frosting( + wf, + adjust_frosting(f, "layer_add_target_date", target_date = "2022-01-07") + ) + expect_silent(predict(wf1, latest)) + + wf1 <- add_frosting( + wf, + adjust_frosting(f, "layer_add_target_date", target_date = 2022L) + ) + expect_error(predict(wf1, latest)) # wrong time type of forecast_date + +}) + From 0c3872360ed0b26a122d9ce911dd61f6fcb52ecc Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 18 Mar 2024 11:40:53 -0700 Subject: [PATCH 15/25] this shouldnt be on this branch --- vignettes/panel-data.Rmd | 545 --------------------------------------- 1 file changed, 545 deletions(-) delete mode 100644 vignettes/panel-data.Rmd diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd deleted file mode 100644 index 39321ae4c..000000000 --- a/vignettes/panel-data.Rmd +++ /dev/null @@ -1,545 +0,0 @@ ---- -title: "Using epipredict on non-epidemic panel data" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Using epipredict on non-epidemic panel data} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=F} -knitr::opts_chunk$set( - echo = TRUE, - collapse = TRUE, - comment = "#>", - out.width = "100%" -) -``` - -```{r libraries} -library(dplyr) -library(tidyr) -library(parsnip) -library(recipes) -library(epiprocess) -library(epipredict) -library(ggplot2) -theme_set(theme_bw()) -``` - -[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, -contain cross-sectional measurements of subjects over time. The `epipredict` -package is most suitable for running forecasters on epidemiological panel data. -A built-in example of this is the [`case_death_rate_subset`]( - https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html) -dataset, which contains daily state-wise measures of `case_rate` and -`death_rate` for COVID-19 in 2021: - -```{r epi-panel-ex, include=T} -head(case_death_rate_subset, 3) -``` - -`epipredict` functions work with data in -[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html) -format. Despite the stated goal and name of the package, other panel datasets -are also valid candidates for `epipredict` functionality, as long as they are -in `epi_df` format. - -```{r employ-stats, include=F} -data("grad_employ_subset") -year_start <- min(grad_employ_subset$time_value) -year_end <- max(grad_employ_subset$time_value) -``` - -# Example panel data overview - -In this vignette, we will demonstrate using `epipredict` with employment panel -data from Statistics Canada. We will be using -[ - Table 37-10-0115-01: Characteristics and median employment income of - longitudinal cohorts of postsecondary graduates two and five years after - graduation, by educational qualification and field of study (primary - groupings) -](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501). - -The full dataset contains yearly median employment income two and five years -after graduation, and number of graduates. The data is stratified by -variables such as geographic region (Canadian province), education, and -age group. The year range of the dataset is `r year_start` to `r year_end`, -inclusive. The full dataset also contains metadata that describes the -quality of data collected. For demonstration purposes, we make the following -modifications to get a subset of the full dataset: - -* Only keep provincial-level geographic region (the full data also has -"Canada" as a region) -* Only keep "good" or better quality data rows, as indicated by the [`STATUS`]( - https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column -* Choose a subset of covariates and aggregate across the remaining ones. The -chosen covariates are age group, and educational qualification. - -Below is the query for obtaining the full data and code for subsetting it as we -just described: - -```{r employ-query, eval=F} -library(cansim) - -# Get statcan data using get_cansim, which returns a tibble -statcan_grad_employ <- get_cansim("37-10-0115-01") - -gemploy <- statcan_grad_employ %>% - # Drop some columns and rename the ones we keep - select(c( - "REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification", - "Field of study", "Gender", "Age group", "Status of student in Canada", - "Characteristics after graduation", "Graduate statistics" - )) %>% - rename( - "geo_value" = "GEO", - "time_value" = "REF_DATE", - "value" = "VALUE", - "status" = "STATUS", - "edu_qual" = "Educational qualification", - "fos" = "Field of study", - "gender" = "Gender", - "age_group" = "Age group", - "student_status" = "Status of student in Canada", - "grad_charac" = "Characteristics after graduation", - "grad_stat" = "Graduate statistics" - ) %>% - # The original `VALUE` column contain the statistic indicated by - # `Graduate statistics` in the original data. Below we pivot the data - # wider so that each unique statistic can have its own column. - mutate( - # Recode for easier pivoting - grad_stat = recode_factor( - grad_stat, - `Number of graduates` = "num_graduates", - `Median employment income two years after graduation` = "med_income_2y", - `Median employment income five years after graduation` = "med_income_5y" - ), - # They are originally strings but want ints for conversion to epi_df later - time_value = as.integer(time_value) - ) %>% - pivot_wider(names_from = grad_stat, values_from = value) %>% - filter( - # Drop aggregates for some columns - geo_value != "Canada" & - age_group != "15 to 64 years" & - edu_qual != "Total, educational qualification" & - # Keep aggregates for keys we don't want to keep - fos == "Total, field of study" & - gender == "Total, gender" & - student_status == "Canadian and international students" & - # Since we're looking at 2y and 5y employment income, the only - # characteristics remaining are: - # - Graduates reporting employment income - # - Graduates reporting wages, salaries, and commissions only - # For simplicity, keep the first one only - grad_charac == "Graduates reporting employment income" & - # Only keep "good" data - is.na(status) & - # Drop NA value rows - !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y) - ) %>% - select(-c(status, gender, student_status, grad_charac, fos)) -``` - -To use this data with `epipredict`, we need to convert it into `epi_df` format -using [`as_epi_df`]( - https://cmu-delphi.github.io/epiprocess/reference/as_epi_df.html) -with additional keys. In our case, the additional keys are `age_group`, -and `edu_qual`. Note that in the above modifications, we encoded `time_value` -as type `integer`. This lets us set `time_type = "year"`, and ensures that -lag and ahead modifications later on are using the correct time units. See the -[`epi_df` documentation]( - https://cmu-delphi.github.io/epiprocess/reference/epi_df.html#time-types) for -a list of all the `type_type`s available. - -```{r convert-to-epidf, eval=F} -grad_employ_subset <- gemploy %>% - tsibble::as_tsibble( - index = time_value, - key = c(geo_value, age_group, edu_qual) - ) %>% - as_epi_df( - geo_type = "custom", time_type = "year", - additional_metadata = c(other_keys = list("age_group", "edu_qual")) - ) -``` - -```{r data-dim, include=F} -employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",") -employ_colcount <- length(names(grad_employ_subset)) -``` - -Now, we are ready to use `grad_employ_subset` with `epipredict`. -Our `epi_df` contains `r employ_rowcount` rows and `r employ_colcount` columns. -Here is a quick summary of the columns in our `epi_df`: - -* `time_value` (time value): year in `date` format -* `geo_value` (geo value): province in Canada -* `num_graduates` (raw, time series value): number of graduates -* `med_income_2y` (raw, time series value): median employment income 2 years -after graduation -* `med_income_5y` (raw, time series value): median employment income 5 years -after graduation -* `age_group` (key): one of two age groups, either 15 to 34 years, or 35 to 64 -years -* `edu_qual` (key): one of 32 unique educational qualifications, e.g., -"Master's diploma" - -```{r preview-data, include=T} -# Rename for simplicity -employ <- grad_employ_subset -sample_n(employ, 6) -``` - -In the following sections, we will go over preprocessing the data in the -`epi_recipe` framework, and fitting a model and making predictions within the -`epipredict` framework and using the package's canned forecasters. - -# Simple autoregressive with 3 lags to predict number of graduates in a year - -## Preprocessing - -As a simple example, let's work with the `num_graduates` column for now. We will -first pre-process by "standardizing" each numeric column by the total within -each group of keys. We do this since those raw numeric values will vary greatly -from province to province since there are large differences in population. - -```{r employ-small, include=T} -employ_small <- employ %>% - group_by(geo_value, age_group, edu_qual) %>% - # Select groups where there are complete timeseries values - filter(n() >= 6) %>% - mutate( - num_graduates_prop = num_graduates / sum(num_graduates), - med_income_2y_prop = med_income_2y / sum(med_income_2y), - med_income_5y_prop = med_income_5y / sum(med_income_5y) - ) %>% - ungroup() -head(employ_small) -``` - -Below is a visualization for a sample of the small data. Note that some groups -do not have any time series information since we filtered out all timeseries -with incomplete dates. - -```{r employ-small-graph, include=T, eval=T, fig.width=9, fig.height=6} -employ_small %>% - filter(geo_value %in% c("British Columbia", "Ontario")) %>% - filter(grepl("degree", edu_qual, fixed = T)) %>% - group_by(geo_value, time_value, edu_qual, age_group) %>% - summarise(num_graduates_prop = sum(num_graduates_prop), .groups = "drop") %>% - ggplot(aes(x = time_value, y = num_graduates_prop, color = geo_value)) + - geom_line() + - scale_colour_manual(values = c("Cornflowerblue", "Orange"), name = "") + - facet_grid(rows = vars(edu_qual), cols = vars(age_group)) + - xlab("Year") + - ylab("Percentage of gratuates") + - theme(legend.position = "bottom") -``` - -We will predict the "standardized" number of graduates (a proportion) in the -next year (time $t+1$) using an autoregressive model with three lags (i.e., an -AR(3) model). Such a model is represented algebraically like this: - -\[ - x_{t+1} = - \alpha_0 + \alpha_1 x_{t} + \alpha_2 x_{t-1} + \alpha_3 x_{t-2} + \epsilon_t -\] - -where $x_i$ is the proportion of graduates at time $i$, and the current time is -$t$. - -In the preprocessing step, we need to create additional columns in `employ` for -each of $x_{t+1}$, $x_{t}$, $x_{t-1}$, and $x_{t-2}$. We do this via an -`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these -outcome and predictor columns; the recipe just stores the instructions for -adding them. - -Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and -3 `lag` columns representing $x_{t}$, $x_{t-1}$, and $x_{t-2}$. Also note that -since we specified our `time_type` to be `year`, our `lag` and `lead` -values are both in years. - -```{r make-recipe, include=T, eval=T} -r <- epi_recipe(employ_small) %>% - step_epi_ahead(num_graduates_prop, ahead = 1) %>% - step_epi_lag(num_graduates_prop, lag = 0:2) %>% - step_epi_naomit() -r -``` - -Let's apply this recipe using `prep` and `bake` to generate and view the `lag` -and `ahead` columns. - -```{r view-preprocessed, include=T} -# Display a sample of the preprocessed data -bake_and_show_sample <- function(recipe, data, n = 5) { - recipe %>% prep(data) %>% bake(new_data = data) %>% sample_n(n) -} - -r %>% bake_and_show_sample(employ_small) -``` - -We can see that the `prep` and `bake` steps created new columns according to -our `epi_recipe`: - -- `ahead_1_num_graduates_prop` corresponds to $x_{t+1}$ -- `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and -`lag_2_num_graduates_prop` correspond to $x_{t}$, $x_{t-1}$, and $x_{t-2}$ -respectively. - -## Model fitting and prediction - -Since our goal for now is to fit a simple autoregressive model, we can use -[`parsnip::linear_reg()`]( - https://parsnip.tidymodels.org/reference/linear_reg.html) with the default -engine `lm`, which fits a linear regression using ordinary least squares. - -We will use `epi_workflow` with the `epi_recipe` we defined in the -preprocessing section along with the `parsnip::linear_reg()` model. Note again -that `epi_workflow` is a container and doesn't actually do the fitting. We have -to pass the workflow into `fit()` to get our model coefficients -$\alpha_i, i=0,...,3$. - -```{r linearreg-wf, include=T} -wf_linreg <- epi_workflow(r, linear_reg()) %>% - fit(employ_small) -summary(extract_fit_engine(wf_linreg)) -``` - -This output tells us the coefficients of the fitted model; for instance, -the intercept is $\alpha_0 = 0.24804$ and the coefficient for $x_{t}$ is -$\alpha_1 = 0.06648$. The summary also tells us that only the intercept and lags -at 2 years and 3 years ago have coefficients significantly greater than zero. - -Extracting the 95% confidence intervals for the coefficients also leads us to -the same conclusion: all the coefficients except for $\alpha_1$ (lag 0) contain -0. - -```{r} -confint(extract_fit_engine(wf_linreg)) -``` - -Now that we have our workflow, we can generate predictions from a subset of our -data. For this demo, we will predict the number of graduates using the last 2 -years of our dataset. - -```{r linearreg-predict, include=T} -latest <- get_test_data(recipe = r, x = employ_small) -preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred)) -# Display a sample of the prediction values, excluding NAs -preds %>% sample_n(5) -``` - -We can do this using the `augment` function too. Note that `predict` and -`augment` both still return an `epi_df` with all of the keys that were present -in the original dataset. - -```{r linearreg-augment, include=T, eval=F} -augment(wf_linreg, latest) %>% sample_n(5) -``` - -## Model diagnostics - -First, we'll plot the residuals (that is, $y_{t} - \hat{y}_{t}$) against the -fitted values ($\hat{y}_{t}$). - -```{r lienarreg-resid-plot, include=T, fig.height=8} -par(mfrow = c(2, 2), mar = c(5, 3.5, 1, 1) + .5) -plot(extract_fit_engine(wf_linreg)) -``` - -The fitted values vs. residuals plot shows us that the residuals are mostly -clustered around zero, but do not form an even band around the zero line, -indicating that the variance of the residuals is not constant. Additionally, -the fitted values vs. square root of standardized residuals makes this more -obvious - the spread of the square root of standardized residuals varies with -the fitted values. - -The Q-Q plot shows us that the residuals have heavier tails than a Normal -distribution. So the normality of residuals assumption doesn't hold either. - -Finally, the residuals vs. leverage plot shows us that we have a few influential -points based on the Cooks distance (those outside the red dotted line). - -Since we appear to be violating the linear model assumptions, we might consider -transforming our data differently, or considering a non-linear model, or -something else. - -# Autoregressive model with exogenous inputs - -Now suppose we want to model the 5-year employment income using 3 lags, while -also incorporating information from the other two time-series in our dataset: -the 2-year employment income and the number of graduates in the previous 2 -years. We would do this using an autoregressive model with exogenous inputs, -defined as follows: - -\[ - y_{t+1} = - \alpha_0 + \alpha_1 y_{t} + \alpha_2 y_{t-1} + \alpha_3 y_{t-2} + - \beta_1 x_{t} + \beta_2 x_{t-1} - \gamma_2 z_{t} + \gamma_2 z_{t-1} -\] - -where $y_i$ is the 5-year median income (proportion) at time $i$, -$x_i$ is the 2-year median income (proportion) at time $i$, and -$z_i$ is the number of graduates (proportion) at time $i$. - -## Preprocessing - -Again, we construct an `epi_recipe` detailing the preprocessing steps. - -```{r custom-arx, include=T} -rx <- epi_recipe(employ_small) %>% - step_epi_ahead(med_income_5y_prop, ahead = 1) %>% - # 5-year median income has 3 lags c(0,1,2) - step_epi_lag(med_income_5y_prop, lag = c(0, 1, 2)) %>% - # But the two exogenous variables have 2 lags c(0,1) - step_epi_lag(med_income_2y_prop, lag = c(0, 1)) %>% - step_epi_lag(num_graduates_prop, lag = c(0, 1)) %>% - step_epi_naomit() - -bake_and_show_sample(rx, employ_small) -``` - -## Model fitting & postprocessing - -Before fitting our model and making predictions, let's add some post-processing -steps using a few [`frosting`]( - https://cmu-delphi.github.io/epipredict/reference/frosting.html) layers to do -a few things: - -1. Convert our predictions back to income values and number of graduates, -rather than standardized proportions. We do this via the frosting layer -`layer_population_scaling`. -2. Threshold our predictions to 0. We are predicting proportions, which can't -be negative. And the transformed values back to dollars and people can't be -negative either. -3. Generate prediction intervals based on residual quantiles, allowing us to -quantify the uncertainty associated with future predicted values. - -```{r custom-arx-post, include=T} -# Create dataframe of the sums we used for standardizing -# Only have to include med_income_5y since that is our outcome -totals <- employ_small %>% - group_by(geo_value, age_group, edu_qual) %>% - summarise(med_income_5y_tot = sum(med_income_5y), .groups = "drop") - -# Define post-processing steps -f <- frosting() %>% - layer_predict() %>% - layer_naomit(.pred) %>% - layer_threshold(.pred, lower = 0) %>% - # 90% prediction interval - layer_residual_quantiles( - quantile_levels = c(0.1, 0.9), - symmetrize = FALSE - ) %>% - layer_population_scaling( - .pred, .pred_distn, - df = totals, df_pop_col = "med_income_5y_tot" - ) - -wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>% - fit(employ_small) %>% - add_frosting(f) - -summary(extract_fit_engine(wfx_linreg)) -``` - -Based on the summary output for this model, only the intercept term, 5-year -median income from 3 years ago, and the 2-year median income from 1 year ago -are significant linear predictors for today's 5-year median income at a 95% -confidence level. Both lags for the number of graduates were insignificant. - -Let's take a look at the predictions along with their 90% prediction intervals. - -```{r} -latest <- get_test_data(recipe = rx, x = employ_small) -predsx <- predict(wfx_linreg, latest) - -# Display values within prediction intervals -predsx %>% - select( - geo_value, time_value, edu_qual, age_group, fos, - .pred_scaled, .pred_distn_scaled - ) %>% - head() %>% - pivot_quantiles_wider(.pred_distn_scaled) -``` - -# Using canned forecasters - -We've seen what we can do with non-epidemiological panel data using the -recipes frame, with `epi_recipe` for preprocessing, `epi_workflow` for model -fitting, and `frosting` for postprocessing. - -`epipredict` also comes with canned forecasters that do all of those steps -behind the scenes for some simple models. Even though we aren't working with -epidemiological data, canned forecasters still work as expected, out of the box. -We will demonstrate this with the simple -[`flatline_forecaster`]( - https://cmu-delphi.github.io/epipredict/reference/flatline_forecaster.html) -and the direct autoregressive (AR) forecaster -[`arx_forecaster`]( - https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html). - -For both illustrations, we will continue to use the `employ_small` dataset -with the transformed numeric columns that are proportions within each group -by the keys in our `epi_df`. - -## Flatline forecaster - -In this first example, we'll use `flatline_forecaster` to make a simple -prediction of the 2-year median income for the next year, based on one previous -time point. This model is representated algebraically as: -\[y_{t+1} = \alpha_0 + \alpha_0 y_{t}\] -where $y_i$ is the 2-year median income (proportion) at time $i$. - -```{r flatline, include=T, warning=F} -out_fl <- flatline_forecaster(employ_small, "med_income_2y_prop", - args_list = flatline_args_list(ahead = 1) -) - -out_fl -``` - -## Autoregressive forecaster with exogenous inputs - -In this second example, we'll use `arx_forecaster` to make a prediction of the -5-year median income based using two lags, _and_ using two lags on two exogenous -variables: 2-year median income and number of graduates. - -The canned forecaster gives us a simple way of making this forecast since it -defines the recipe, workflow, and post-processing steps behind the scenes. This -is very similar to the model we introduced in the "Autoregressive Linear Model -with Exogenous Inputs" section of this article, but where all inputs have the -same number of lags. - -```{r arx-lr, include=T, warning=F} -arx_args <- arx_args_list(lags = c(0L, 1L), ahead = 1L) - -out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop", - c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"), - args_list = arx_args -) - -out_arx_lr -``` - -Other changes to the direct AR forecaster, like changing the engine, also work -as expected. Below we use a boosted tree model instead of a linear regression. - -```{r arx-rf, include=T, warning=F} -out_arx_rf <- arx_forecaster( - employ_small, "med_income_5y_prop", - c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"), - trainer = parsnip::boost_tree(mode = "regression", trees = 20), - args_list = arx_args -) - -out_arx_rf -``` From e036b8d5d6713b9522607c96ce62d05a276ae83a Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 18 Mar 2024 11:44:39 -0700 Subject: [PATCH 16/25] abort message typo --- R/layer_add_forecast_date.R | 2 +- R/layer_add_target_date.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index ea3635d79..31196f51c 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -104,7 +104,7 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da cli::cli_abort(c( "The `forecast_date` was given as a {.val {check$x}} while the", `!` = "`time_type` of the training data was {.val {check$expected}}.", - i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." + i = "See {.topic epiprocess::epi_df} for how these are determined." )) } diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 8ab077efa..6a9affc71 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -110,7 +110,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data cli::cli_abort(c( "The `forecast_date` was given as a {.val {check$x}} while the", `!` = "`time_type` of the training data was {.val {check$expected}}.", - i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." + i = "See {.topic epiprocess::epi_df} for how these are determined." )) } forecast_date <- coerce_time_type(possible_fd, expected_time_type) From e6e60cd7b5305bcc3af65e0dc6501234cb1225b1 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 18 Mar 2024 11:49:36 -0700 Subject: [PATCH 17/25] bump version, add to news --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 68a566cee..9329382fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.0.10 +Version: 0.0.11 Authors@R: c( person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 04dc78e4f..5f629f45f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,3 +33,5 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Working vignette - use `checkmate` for input validation - refactor quantile extrapolation (possibly creates different results) +- force `target_date` + `forecast_date` handling to match the time_type of + the epi_df. allows for annual and weekly data From 845c1a9ecd7162e727c8c9cf4ee11754afaba345 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 18 Mar 2024 12:02:17 -0700 Subject: [PATCH 18/25] run styler --- R/epi_workflow.R | 2 +- R/layer_add_forecast_date.R | 2 +- R/layer_add_target_date.R | 23 ++++----- R/step_epi_shift.R | 5 +- R/time_types.R | 50 +++++++++++++------ tests/testthat/test-layer_add_forecast_date.R | 1 - tests/testthat/test-layer_add_target_date.R | 2 - 7 files changed, 49 insertions(+), 36 deletions(-) diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 0fca5dc6c..e64e0f7bc 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -253,7 +253,7 @@ predict.epi_workflow <- function(object, new_data, ...) { if (!workflows::is_trained_workflow(object)) { cli::cli_abort(c( "Can't predict on an untrained epi_workflow.", - i = "Do you need to call `fit()`?" + i = "Do you need to call `fit()`?" )) } components <- list() diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 31196f51c..c02fc9173 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -97,7 +97,7 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da expected_time_type <- attr( workflows::extract_preprocessor(workflow)$template, "metadata" )$time_type - if (expected_time_type == "week") expected_time_type = "day" + if (expected_time_type == "week") expected_time_type <- "day" check <- validate_date(object$forecast_date, expected_time_type) if (!check$ok) { diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 6a9affc71..9034398ff 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -87,7 +87,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data expected_time_type <- attr( workflows::extract_preprocessor(workflow)$template, "metadata" )$time_type - if (expected_time_type == "week") expected_time_type = "day" + if (expected_time_type == "week") expected_time_type <- "day" if (!is.null(object$target_date)) { check <- validate_date(object$target_date, expected_time_type) @@ -101,10 +101,9 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data target_date <- coerce_time_type(object$target_date, expected_time_type) } else if ( detect_layer(the_frosting, "layer_add_forecast_date") && - !is.null(possible_fd <- extract_argument( - the_frosting, "layer_add_forecast_date", "forecast_date" - ))) { - + !is.null(possible_fd <- extract_argument( + the_frosting, "layer_add_forecast_date", "forecast_date" + ))) { check <- validate_date(possible_fd, expected_time_type) if (!check$ok) { cli::cli_abort(c( @@ -117,13 +116,13 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") target_date <- forecast_date + ahead } else { - max_time_value <- max( - workflows::extract_preprocessor(workflow)$max_time_value, - workflow$fit$meta$max_time_value, - max(new_data$time_value) - ) - ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") - target_date <- max_time_value + ahead + max_time_value <- max( + workflows::extract_preprocessor(workflow)$max_time_value, + workflow$fit$meta$max_time_value, + max(new_data$time_value) + ) + ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") + target_date <- max_time_value + ahead } components$predictions <- dplyr::bind_cols(components$predictions, diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 94ef7178c..52f51de16 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -120,9 +120,8 @@ step_epi_ahead <- if (missing(ahead)) { cli::cli_abort(c( "The `ahead` argument must not be empty.", - i = "Did you perhaps pass an integer in `...` accidentally?" - ) - ) + i = "Did you perhaps pass an integer in `...` accidentally?" + )) } arg_is_nonneg_int(ahead) arg_is_chr_scalar(prefix, id) diff --git a/R/time_types.R b/R/time_types.R index 5f3aa4c14..0d1fa0662 100644 --- a/R/time_types.R +++ b/R/time_types.R @@ -3,23 +3,39 @@ guess_time_type <- function(time_value) { arg_is_scalar(time_value) if (is.character(time_value)) { if (nchar(time_value) <= "10") { - new_time_value <- tryCatch({ - as.Date(time_value) - }, error = function(e) NULL) + new_time_value <- tryCatch( + { + as.Date(time_value) + }, + error = function(e) NULL + ) } else { - new_time_value <- tryCatch({ - as.POSIXct(time_value) - }, error = function(e) NULL) + new_time_value <- tryCatch( + { + as.POSIXct(time_value) + }, + error = function(e) NULL + ) } if (!is.null(new_time_value)) time_value <- new_time_value } - if (inherits(time_value, "POSIXct")) return("day-time") - if (inherits(time_value, "Date")) return("day") - if (inherits(time_value, "yearweek")) return("yearweek") - if (inherits(time_value, "yearmonth")) return("yearmonth") - if (inherits(time_value, "yearquarter")) return("yearquarter") + if (inherits(time_value, "POSIXct")) { + return("day-time") + } + if (inherits(time_value, "Date")) { + return("day") + } + if (inherits(time_value, "yearweek")) { + return("yearweek") + } + if (inherits(time_value, "yearmonth")) { + return("yearmonth") + } + if (inherits(time_value, "yearquarter")) { + return("yearquarter") + } if (is.numeric(time_value) && all(time_value == as.integer(time_value)) && - all(time_value >= 1582)) { + all(time_value >= 1582)) { return("year") } return("custom") @@ -27,11 +43,13 @@ guess_time_type <- function(time_value) { coerce_time_type <- function(x, target_type) { if (target_type == "year") { - if (is.numeric(x)) return(as.integer(x)) - else return(as.POSIXlt(x)$year + 1900L) + if (is.numeric(x)) { + return(as.integer(x)) + } else { + return(as.POSIXlt(x)$year + 1900L) + } } - switch( - target_type, + switch(target_type, "day-time" = as.POSIXct(x), "day" = as.Date(x), "week" = as.Date(x), diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 93f13baf7..698924af3 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -104,5 +104,4 @@ test_that("forecast date works for daily", { adjust_frosting(f, "layer_add_forecast_date", forecast_date = 2022L) ) expect_error(predict(wf1, latest)) # wrong time type of forecast_date - }) diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index f3b706aa6..6316ec991 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -115,6 +115,4 @@ test_that("forecast date works for daily", { adjust_frosting(f, "layer_add_target_date", target_date = 2022L) ) expect_error(predict(wf1, latest)) # wrong time type of forecast_date - }) - From 8feef06ded15745a3d0ae0be73841eaf8acc3eee Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 27 Mar 2024 11:12:23 -0700 Subject: [PATCH 19/25] Apply @dsweber2 suggestions from code review Co-authored-by: David Weber --- tests/testthat/test-layer_add_forecast_date.R | 16 ++++++++++------ tests/testthat/test-layer_add_target_date.R | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 698924af3..809644b73 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -84,24 +84,28 @@ test_that("forecast date works for daily", { wf1 <- add_frosting(wf, f) p <- predict(wf1, latest) + # both forecast_date and epi_df are dates expect_identical(p$forecast_date[1], as.Date("2021-12-31")) - latest_bad <- latest %>% + # forecast_date is a date while the epi_df uses years (ints) + latest_yearly <- latest %>% unclass() %>% as.data.frame() %>% mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>% as_epi_df() + expect_error(predict(wf1, latest_yearly)) - expect_error(predict(wf1, latest_bad)) - wf1 <- add_frosting( + # forecast_date is a string + wf2 <- add_frosting( wf, adjust_frosting(f, "layer_add_forecast_date", forecast_date = "2022-01-01") ) - expect_silent(predict(wf1, latest)) + expect_silent(predict(wf2, latest)) - wf1 <- add_frosting( + # forecast_date is a year/int while the epi_df is a date + wf3 <- add_frosting( wf, adjust_frosting(f, "layer_add_forecast_date", forecast_date = 2022L) ) - expect_error(predict(wf1, latest)) # wrong time type of forecast_date + expect_error(predict(wf3, latest)) }) diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 6316ec991..999c07367 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -87,7 +87,7 @@ test_that("Specify own target date", { expect_named(p2, c("geo_value", "time_value", ".pred", "target_date")) }) -test_that("forecast date works for daily", { +test_that("forecast date works for daily and yearly", { f <- frosting() %>% layer_predict() %>% layer_add_target_date() %>% From 6d5c3791956db72c3458c5c06576a97eddc90495 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 27 Mar 2024 11:13:28 -0700 Subject: [PATCH 20/25] add back omitted test, include some comments --- tests/testthat/test-layer_add_forecast_date.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 698924af3..d452990e9 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -11,9 +11,9 @@ latest <- jhu %>% test_that("layer validation works", { f <- frosting() - - - expect_error(layer_add_forecast_date(f, "2022-05-31", id = 2)) + expect_error(layer_add_forecast_date(f, c("2022-05-31", "2022-05-31"))) # multiple forecast_dates + expect_error(layer_add_forecast_date(f, "2022-05-31", id = 2)) # id is not a character + expect_error(layer_add_forecast_date(f, "2022-05-31", id = c("a", "b"))) # multiple ids expect_silent(layer_add_forecast_date(f, "2022-05-31")) expect_silent(layer_add_forecast_date(f)) expect_silent(layer_add_forecast_date(f, as.Date("2022-05-31"))) From 047caa685e8a1038bc93afa178ff323207dd48ef Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 27 Mar 2024 11:13:45 -0700 Subject: [PATCH 21/25] validate scalar forecast_date --- R/layer_add_forecast_date.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index c02fc9173..376194712 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -79,6 +79,7 @@ layer_add_forecast_date <- layer_add_forecast_date_new <- function(forecast_date, id) { arg_is_chr_scalar(id) + arg_is_scalar(forecast_date, allow_null = TRUE) # can't validate forecast_date until we know the time_type layer("add_forecast_date", forecast_date = forecast_date, id = id) } From fc05cd93d9a32a6060ed02a301188364c453e165 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 27 Mar 2024 11:15:34 -0700 Subject: [PATCH 22/25] symmetrize scalar validation --- R/layer_add_forecast_date.R | 7 ++++--- R/layer_add_target_date.R | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 376194712..18d57337f 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -68,6 +68,9 @@ #' p3 layer_add_forecast_date <- function(frosting, forecast_date = NULL, id = rand_id("add_forecast_date")) { + arg_is_chr_scalar(id) + arg_is_scalar(forecast_date, allow_null = TRUE) + # can't validate the type of forecast_date until we know the time_type add_layer( frosting, layer_add_forecast_date_new( @@ -78,9 +81,7 @@ layer_add_forecast_date <- } layer_add_forecast_date_new <- function(forecast_date, id) { - arg_is_chr_scalar(id) - arg_is_scalar(forecast_date, allow_null = TRUE) - # can't validate forecast_date until we know the time_type + layer("add_forecast_date", forecast_date = forecast_date, id = id) } diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 9034398ff..217a06069 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -63,8 +63,9 @@ #' p3 layer_add_target_date <- function(frosting, target_date = NULL, id = rand_id("add_target_date")) { - # can't validate target_date until we know the time_type arg_is_chr_scalar(id) + arg_is_scalar(target_date, allow_null = TRUE) + # can't validate the type of target_date until we know the time_type add_layer( frosting, layer_add_target_date_new( From 1e76059401b996234fad022115b8f433967ff352 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 27 Mar 2024 11:18:47 -0700 Subject: [PATCH 23/25] add uncommitted stuff from code review --- tests/testthat/test-layer_add_target_date.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 999c07367..49049da7e 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -87,7 +87,7 @@ test_that("Specify own target date", { expect_named(p2, c("geo_value", "time_value", ".pred", "target_date")) }) -test_that("forecast date works for daily and yearly", { +test_that("target date works for daily and yearly", { f <- frosting() %>% layer_predict() %>% layer_add_target_date() %>% @@ -95,21 +95,25 @@ test_that("forecast date works for daily and yearly", { wf1 <- add_frosting(wf, f) p <- predict(wf1, latest) + # both target_date and epi_df are dates expect_identical(p$target_date[1], as.Date("2021-12-31") + 7L) + # target_date is a date while the epi_df uses years (ints) latest_bad <- latest %>% unclass() %>% as.data.frame() %>% mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>% as_epi_df() - expect_error(predict(wf1, latest_bad)) + + # target_date is a string (gets correctly converted to Date) wf1 <- add_frosting( wf, adjust_frosting(f, "layer_add_target_date", target_date = "2022-01-07") ) expect_silent(predict(wf1, latest)) + # target_date is a year/int while the epi_df is a date wf1 <- add_frosting( wf, adjust_frosting(f, "layer_add_target_date", target_date = 2022L) From f5ef88ba2f08c6851971c295205180d0ca20c3ed Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 27 Mar 2024 13:37:49 -0700 Subject: [PATCH 24/25] clarify some test purposes --- tests/testthat/test-layer_add_forecast_date.R | 5 +++-- tests/testthat/test-layer_add_target_date.R | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 1e04a3c11..9595b47b6 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -87,7 +87,8 @@ test_that("forecast date works for daily", { # both forecast_date and epi_df are dates expect_identical(p$forecast_date[1], as.Date("2021-12-31")) - # forecast_date is a date while the epi_df uses years (ints) + # the error happens at predict time because the + # time_value train/test types don't match latest_yearly <- latest %>% unclass() %>% as.data.frame() %>% @@ -95,7 +96,7 @@ test_that("forecast date works for daily", { as_epi_df() expect_error(predict(wf1, latest_yearly)) - # forecast_date is a string + # forecast_date is a string, gets correctly converted to date wf2 <- add_frosting( wf, adjust_frosting(f, "layer_add_forecast_date", forecast_date = "2022-01-01") diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 49049da7e..e5349839b 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -98,7 +98,8 @@ test_that("target date works for daily and yearly", { # both target_date and epi_df are dates expect_identical(p$target_date[1], as.Date("2021-12-31") + 7L) - # target_date is a date while the epi_df uses years (ints) + # the error happens at predict time because the + # time_value train/test types don't match latest_bad <- latest %>% unclass() %>% as.data.frame() %>% From c1e3ff9c2a70bc018fe1232f6845270512aa8815 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 27 Mar 2024 13:38:21 -0700 Subject: [PATCH 25/25] refactor validate_date() to avoid duplication --- R/layer_add_forecast_date.R | 23 ++++++++++------------- R/layer_add_target_date.R | 30 +++++++++++------------------- R/time_types.R | 15 +++++++++++---- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 18d57337f..5bd6b6918 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -81,7 +81,6 @@ layer_add_forecast_date <- } layer_add_forecast_date_new <- function(forecast_date, id) { - layer("add_forecast_date", forecast_date = forecast_date, id = id) } @@ -93,27 +92,25 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da workflow$fit$meta$max_time_value, max(new_data$time_value) ) - object$forecast_date <- max_time_value + forecast_date <- max_time_value + } else { + forecast_date <- object$forecast_date } expected_time_type <- attr( workflows::extract_preprocessor(workflow)$template, "metadata" )$time_type if (expected_time_type == "week") expected_time_type <- "day" - check <- validate_date(object$forecast_date, expected_time_type) - - if (!check$ok) { - cli::cli_abort(c( - "The `forecast_date` was given as a {.val {check$x}} while the", - `!` = "`time_type` of the training data was {.val {check$expected}}.", - i = "See {.topic epiprocess::epi_df} for how these are determined." - )) - } - + validate_date(forecast_date, expected_time_type, + call = expr(layer_add_forecast_date()) + ) + forecast_date <- coerce_time_type(forecast_date, expected_time_type) + object$forecast_date <- forecast_date components$predictions <- dplyr::bind_cols( components$predictions, - forecast_date = coerce_time_type(object$forecast_date, expected_time_type) + forecast_date = forecast_date ) + components } diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 217a06069..a50b6042c 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -91,29 +91,20 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data if (expected_time_type == "week") expected_time_type <- "day" if (!is.null(object$target_date)) { - check <- validate_date(object$target_date, expected_time_type) - if (!check$ok) { - cli::cli_abort(c( - "The `target_date` was given as a {.val {check$x}} while the", - `!` = "`time_type` of the training data was {.val {check$expected}}.", - i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." - )) - } - target_date <- coerce_time_type(object$target_date, expected_time_type) + target_date <- object$target_date + validate_date(target_date, expected_time_type, + call = expr(layer_add_target_date()) + ) + target_date <- coerce_time_type(target_date, expected_time_type) } else if ( detect_layer(the_frosting, "layer_add_forecast_date") && - !is.null(possible_fd <- extract_argument( + !is.null(forecast_date <- extract_argument( the_frosting, "layer_add_forecast_date", "forecast_date" ))) { - check <- validate_date(possible_fd, expected_time_type) - if (!check$ok) { - cli::cli_abort(c( - "The `forecast_date` was given as a {.val {check$x}} while the", - `!` = "`time_type` of the training data was {.val {check$expected}}.", - i = "See {.topic epiprocess::epi_df} for how these are determined." - )) - } - forecast_date <- coerce_time_type(possible_fd, expected_time_type) + validate_date(forecast_date, expected_time_type, + call = expr(layer_add_forecast_date()) + ) + forecast_date <- coerce_time_type(forecast_date, expected_time_type) ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") target_date <- forecast_date + ahead } else { @@ -126,6 +117,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data target_date <- max_time_value + ahead } + object$target_date <- target_date components$predictions <- dplyr::bind_cols(components$predictions, target_date = target_date ) diff --git a/R/time_types.R b/R/time_types.R index 0d1fa0662..7fe3e47b4 100644 --- a/R/time_types.R +++ b/R/time_types.R @@ -59,8 +59,15 @@ coerce_time_type <- function(x, target_type) { ) } -validate_date <- function(x, expected) { - x <- guess_time_type(x) - ok <- x == expected - enlist(ok, x, expected) +validate_date <- function(x, expected, arg = rlang::caller_arg(x), + call = rlang::caller_env()) { + time_type_x <- guess_time_type(x) + ok <- time_type_x == expected + if (!ok) { + cli::cli_abort(c( + "The {.arg {arg}} was given as a {.val {time_type_x}} while the", + `!` = "`time_type` of the training data was {.val {expected}}.", + i = "See {.topic epiprocess::epi_df} for descriptions of these are determined." + ), call = call) + } }