From dff767efb88069f5795310cc9b3be789a251a98f Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Thu, 7 Mar 2024 15:27:44 -0800
Subject: [PATCH 01/25] add period utils

---
 NAMESPACE  |  1 +
 R/period.R | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 R/period.R

diff --git a/NAMESPACE b/NAMESPACE
index 3c63145b3..89a522d66 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -43,6 +43,7 @@ S3method(fit,epi_workflow)
 S3method(flusight_hub_formatter,canned_epipred)
 S3method(flusight_hub_formatter,data.frame)
 S3method(format,dist_quantiles)
+S3method(format,period)
 S3method(is.na,dist_quantiles)
 S3method(is.na,distribution)
 S3method(mean,dist_quantiles)
diff --git a/R/period.R b/R/period.R
new file mode 100644
index 000000000..af34a5896
--- /dev/null
+++ b/R/period.R
@@ -0,0 +1,41 @@
+new_period <- function(x) {
+  arg_is_scalar(x)
+  arg_is_chr(x)
+
+  n <- as.integer(strextract("^[0-9]+", x))
+  names_in <- tolower(strextract("[a-zA-Z]+$", x))
+  names_allowed <- paste0(rlang::fn_fmls_names(default_period), "s")
+  if (length(n) == 0L || length(names_in) == 0L ||
+    is.na(pmatch(names_in, names_allowed))) {
+    cli_abort(c(
+      "Requested periodicity {.var {names_in}} is not available.",
+      i = "Input must be a positive integer followed by one of {.val {names_allowed}}."
+    ))
+  }
+  names_in <- gsub("s$", "", names_in)
+  l <- rlang::list2(!!names_in := n)
+  res <- eval(rlang::call2("default_period", !!!l))
+  vctrs::new_rcrd(res, class = "period")
+}
+
+default_period <- function(year = 0L, quarter = 0L, month = 0L, epiweek = 0L,
+                           week = 0L, day = 0L, hours = 0L, minutes = 0L,
+                           seconds = 0L) {
+  enlist(
+    year = year, month = month + 3L * quarter, epiweek = epiweek,
+    day = day + 7L * week, hours = hours, minutes = minutes, seconds = seconds
+  )
+}
+
+#' @method format period
+#' @export
+format.period <- function(x, ...) {
+  nms <- c("Y", "M", "EW", "D", "h", "m", "s")
+  val <- vctrs::vec_c(!!!vctrs::vec_data(x))
+  paste0(val[val != 0], nms[val != 0])
+}
+
+strextract <- function(pattern, x) {
+  m <- regexec(pattern, x)
+  unlist(regmatches(x, m))
+}

From 06678456e80906644b21365ee12753318d0007e4 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Fri, 8 Mar 2024 14:35:55 -0800
Subject: [PATCH 02/25] ensure joins happen silently

---
 R/layer_population_scaling.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 2b7057bef..750ced0ee 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -144,6 +144,12 @@ slather.layer_population_scaling <-
         length(object$df_pop_col) == 1
     )
 
+    if (is.null(object$by)) {
+      object$by <- intersect(
+        kill_time_value(epi_keys(components$predictions)),
+        colnames(dplyr::select(object$df, !object$df_pop_col))
+      )
+    }
     try_join <- try(
       dplyr::left_join(components$predictions, object$df,
         by = object$by

From edd5134144c9a6c0b7d93e3bc615c2c77d20eda6 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 07:40:54 -0800
Subject: [PATCH 03/25] fix time_type processing in forecast/target date

---
 R/epi_workflow.R             |  7 +++--
 R/layer_add_forecast_date.R  | 27 ++++++++++---------
 R/layer_add_target_date.R    | 52 +++++++++++++++++++++++-------------
 R/layer_population_scaling.R |  5 ++--
 R/period.R                   | 41 ----------------------------
 R/step_epi_shift.R           | 21 ++++++++-------
 R/time_types.R               | 46 +++++++++++++++++++++++++++++++
 7 files changed, 110 insertions(+), 89 deletions(-)
 delete mode 100644 R/period.R
 create mode 100644 R/time_types.R

diff --git a/R/epi_workflow.R b/R/epi_workflow.R
index d5e7d13a2..0fca5dc6c 100644
--- a/R/epi_workflow.R
+++ b/R/epi_workflow.R
@@ -251,11 +251,10 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor
 #' preds
 predict.epi_workflow <- function(object, new_data, ...) {
   if (!workflows::is_trained_workflow(object)) {
-    rlang::abort(
-      c("Can't predict on an untrained epi_workflow.",
+    cli::cli_abort(c(
+      "Can't predict on an untrained epi_workflow.",
         i = "Do you need to call `fit()`?"
-      )
-    )
+    ))
   }
   components <- list()
   components$mold <- workflows::extract_mold(object)
diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R
index 6bb2cf572..ea3635d79 100644
--- a/R/layer_add_forecast_date.R
+++ b/R/layer_add_forecast_date.R
@@ -78,8 +78,8 @@ layer_add_forecast_date <-
   }
 
 layer_add_forecast_date_new <- function(forecast_date, id) {
-  forecast_date <- arg_to_date(forecast_date, allow_null = TRUE)
   arg_is_chr_scalar(id)
+  # can't validate forecast_date until we know the time_type
   layer("add_forecast_date", forecast_date = forecast_date, id = id)
 }
 
@@ -93,23 +93,24 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da
     )
     object$forecast_date <- max_time_value
   }
-  as_of_pre <- attributes(workflows::extract_preprocessor(workflow)$template)$metadata$as_of
-  as_of_fit <- workflow$fit$meta$as_of
-  as_of_post <- attributes(new_data)$metadata$as_of
 
-  as_of_date <- as.Date(max(as_of_pre, as_of_fit, as_of_post))
+  expected_time_type <- attr(
+    workflows::extract_preprocessor(workflow)$template, "metadata"
+  )$time_type
+  if (expected_time_type == "week") expected_time_type = "day"
+  check <- validate_date(object$forecast_date, expected_time_type)
 
-  if (object$forecast_date < as_of_date) {
-    cli_warn(
-      c("The forecast_date is less than the most ",
-        "recent update date of the data: ",
-        i = "forecast_date = {object$forecast_date} while data is from {as_of_date}."
-      )
-    )
+  if (!check$ok) {
+    cli::cli_abort(c(
+      "The `forecast_date` was given as a {.val {check$x}} while the",
+      `!` = "`time_type` of the training data was {.val {check$expected}}.",
+      i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
+    ))
   }
+
   components$predictions <- dplyr::bind_cols(
     components$predictions,
-    forecast_date = as.Date(object$forecast_date)
+    forecast_date = coerce_time_type(object$forecast_date, expected_time_type)
   )
   components
 }
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index f2fee889f..6a48cc923 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -63,7 +63,7 @@
 #' p3
 layer_add_target_date <-
   function(frosting, target_date = NULL, id = rand_id("add_target_date")) {
-    target_date <- arg_to_date(target_date, allow_null = TRUE)
+    # can't validate target_date until we know the time_type
     arg_is_chr_scalar(id)
     add_layer(
       frosting,
@@ -84,33 +84,47 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
   the_recipe <- workflows::extract_recipe(workflow)
   the_frosting <- extract_frosting(workflow)
 
-  if (!is.null(object$target_date)) {
-    target_date <- as.Date(object$target_date)
-  } else { # null target date case
-    if (detect_layer(the_frosting, "layer_add_forecast_date") &&
-      !is.null(extract_argument(
-        the_frosting,
-        "layer_add_forecast_date", "forecast_date"
-      ))) {
-      forecast_date <- extract_argument(
-        the_frosting,
-        "layer_add_forecast_date", "forecast_date"
-      )
+  expected_time_type <- attr(
+    workflows::extract_preprocessor(workflow)$template, "metadata"
+  )$time_type
+  if (expected_time_type == "week") expected_time_type = "day"
 
-      ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
+  browser()
+  if (!is.null(object$target_date)) {
+    check <- validate_date(object$target_date, expected_time_type)
+    if (!check$ok) {
+      cli::cli_abort(c(
+        "The `target_date` was given as a {.val {check$x}} while the",
+        `!` = "`time_type` of the training data was {.val {check$expected}}.",
+        i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
+      ))
+    }
+    target_date <- coerce_time_type(object$target_date, expected_time_type)
+  } else if (
+    detect_layer(the_frosting, "layer_add_forecast_date") &&
+    !is.null(possible_fd <- extract_argument(
+      the_frosting, "layer_add_forecast_date", "forecast_date"
+    ))) {
 
-      target_date <- forecast_date + ahead
-    } else {
+    check <- validate_date(possible_fd, expected_time_type)
+    if (!check$ok) {
+      cli::cli_abort(c(
+        "The `forecast_date` was given as a {.val {check$x}} while the",
+        `!` = "`time_type` of the training data was {.val {check$expected}}.",
+        i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
+      ))
+    }
+    forecast_date <- coerce_time_type(fd, expected_time_type)
+    ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
+    target_date <- forecast_date + ahead
+  } else {
       max_time_value <- max(
         workflows::extract_preprocessor(workflow)$max_time_value,
         workflow$fit$meta$max_time_value,
         max(new_data$time_value)
       )
-
       ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
-
       target_date <- max_time_value + ahead
-    }
   }
 
   components$predictions <- dplyr::bind_cols(components$predictions,
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 750ced0ee..1cdc35d7c 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -144,6 +144,7 @@ slather.layer_population_scaling <-
         length(object$df_pop_col) == 1
     )
 
+    browser()
     if (is.null(object$by)) {
       object$by <- intersect(
         kill_time_value(epi_keys(components$predictions)),
@@ -163,8 +164,8 @@ slather.layer_population_scaling <-
       ))
     }
 
-    object$df <- object$df %>%
-      dplyr::mutate(dplyr::across(tidyselect::where(is.character), tolower))
+    # object$df <- object$df %>%
+    #  dplyr::mutate(dplyr::across(tidyselect::where(is.character), tolower))
     pop_col <- rlang::sym(object$df_pop_col)
     exprs <- rlang::expr(c(!!!object$terms))
     pos <- tidyselect::eval_select(exprs, components$predictions)
diff --git a/R/period.R b/R/period.R
deleted file mode 100644
index af34a5896..000000000
--- a/R/period.R
+++ /dev/null
@@ -1,41 +0,0 @@
-new_period <- function(x) {
-  arg_is_scalar(x)
-  arg_is_chr(x)
-
-  n <- as.integer(strextract("^[0-9]+", x))
-  names_in <- tolower(strextract("[a-zA-Z]+$", x))
-  names_allowed <- paste0(rlang::fn_fmls_names(default_period), "s")
-  if (length(n) == 0L || length(names_in) == 0L ||
-    is.na(pmatch(names_in, names_allowed))) {
-    cli_abort(c(
-      "Requested periodicity {.var {names_in}} is not available.",
-      i = "Input must be a positive integer followed by one of {.val {names_allowed}}."
-    ))
-  }
-  names_in <- gsub("s$", "", names_in)
-  l <- rlang::list2(!!names_in := n)
-  res <- eval(rlang::call2("default_period", !!!l))
-  vctrs::new_rcrd(res, class = "period")
-}
-
-default_period <- function(year = 0L, quarter = 0L, month = 0L, epiweek = 0L,
-                           week = 0L, day = 0L, hours = 0L, minutes = 0L,
-                           seconds = 0L) {
-  enlist(
-    year = year, month = month + 3L * quarter, epiweek = epiweek,
-    day = day + 7L * week, hours = hours, minutes = minutes, seconds = seconds
-  )
-}
-
-#' @method format period
-#' @export
-format.period <- function(x, ...) {
-  nms <- c("Y", "M", "EW", "D", "h", "m", "s")
-  val <- vctrs::vec_c(!!!vctrs::vec_data(x))
-  paste0(val[val != 0], nms[val != 0])
-}
-
-strextract <- function(pattern, x) {
-  m <- regexec(pattern, x)
-  unlist(regmatches(x, m))
-}
diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R
index ec5428d8f..94ef7178c 100644
--- a/R/step_epi_shift.R
+++ b/R/step_epi_shift.R
@@ -53,20 +53,20 @@
 step_epi_lag <-
   function(recipe,
            ...,
+           lag,
            role = "predictor",
            trained = FALSE,
-           lag,
            prefix = "lag_",
            default = NA,
            columns = NULL,
            skip = FALSE,
            id = rand_id("epi_lag")) {
     if (!is_epi_recipe(recipe)) {
-      rlang::abort("This recipe step can only operate on an `epi_recipe`.")
+      cli::cli_abort("This step can only operate on an `epi_recipe`.")
     }
 
     if (missing(lag)) {
-      rlang::abort(
+      cli::cli_abort(
         c("The `lag` argument must not be empty.",
           i = "Did you perhaps pass an integer in `...` accidentally?"
         )
@@ -75,7 +75,8 @@ step_epi_lag <-
     arg_is_nonneg_int(lag)
     arg_is_chr_scalar(prefix, id)
     if (!is.null(columns)) {
-      rlang::abort(c("The `columns` argument must be `NULL.",
+      cli::cli_abort(c(
+        "The `columns` argument must be `NULL.",
         i = "Use `tidyselect` methods to choose columns to lag."
       ))
     }
@@ -85,7 +86,7 @@ step_epi_lag <-
         terms = dplyr::enquos(...),
         role = role,
         trained = trained,
-        lag = lag,
+        lag = as.integer(lag),
         prefix = prefix,
         default = default,
         keys = epi_keys(recipe),
@@ -104,21 +105,21 @@ step_epi_lag <-
 step_epi_ahead <-
   function(recipe,
            ...,
+           ahead,
            role = "outcome",
            trained = FALSE,
-           ahead,
            prefix = "ahead_",
            default = NA,
            columns = NULL,
            skip = FALSE,
            id = rand_id("epi_ahead")) {
     if (!is_epi_recipe(recipe)) {
-      rlang::abort("This recipe step can only operate on an `epi_recipe`.")
+      cli::cli_abort("This step can only operate on an `epi_recipe`.")
     }
 
     if (missing(ahead)) {
-      rlang::abort(
-        c("The `ahead` argument must not be empty.",
+      cli::cli_abort(c(
+        "The `ahead` argument must not be empty.",
           i = "Did you perhaps pass an integer in `...` accidentally?"
         )
       )
@@ -136,7 +137,7 @@ step_epi_ahead <-
         terms = dplyr::enquos(...),
         role = role,
         trained = trained,
-        ahead = ahead,
+        ahead = as.integer(ahead),
         prefix = prefix,
         default = default,
         keys = epi_keys(recipe),
diff --git a/R/time_types.R b/R/time_types.R
new file mode 100644
index 000000000..02852696d
--- /dev/null
+++ b/R/time_types.R
@@ -0,0 +1,46 @@
+guess_time_type <- function(time_value) {
+  if (is.character(time_value)) {
+    if (nchar(time_value[1]) <= "10") {
+      new_time_value <- tryCatch({
+        as.Date(time_value)
+      }, error = function(e) NULL)
+    } else {
+      new_time_value <- tryCatch({
+        as.POSIXct(time_value)
+      }, error = function(e) NULL)
+    }
+    if (!is.null(new_time_value)) time_value <- new_time_value
+  }
+  if (inherits(time_value, "POSIXct")) return("day-time")
+  if (inherits(time_value, "Date")) return("day")
+  if (inherits(time_value, "yearweek")) return("yearweek")
+  if (inherits(time_value, "yearmonth")) return("yearmonth")
+  if (inherits(time_value, "yearquarter")) return("yearquarter")
+  if (is.numeric(time_value) && all(time_value == as.integer(time_value)) &&
+      all(time_value >= 1582)) {
+    return("year")
+  }
+  return("custom")
+}
+
+coerce_time_type <- function(x, target_type) {
+  if (target_type == "year") {
+    if (is.numeric(x)) return(as.integer(x))
+    else return(as.POSIXlt(x)$year + 1900L)
+  }
+  switch(
+    target_type,
+    "day-time" = as.POSIXct(x),
+    "day" = as.Date(x),
+    "week" = as.Date(x),
+    "yearweek" = tsibble::yearweek(x),
+    "yearmonth" = tsibble::yearmonth(x),
+    "yearquarter" = tsibble::yearquarter(x)
+  )
+}
+
+validate_date <- function(x, expected) {
+  x <- guess_time_type(x)
+  ok <- x == expected
+  enlist(ok, x, expected)
+}

From e3b2907db94784dc8636561bfdb543b9856d5384 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 07:41:05 -0800
Subject: [PATCH 04/25] import tsibble

---
 DESCRIPTION | 1 +
 NAMESPACE   | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index e068fe7e3..68a566cee 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -46,6 +46,7 @@ Imports:
     tibble,
     tidyr,
     tidyselect,
+    tsibble,
     usethis,
     vctrs,
     workflows (>= 1.0.0)
diff --git a/NAMESPACE b/NAMESPACE
index 89a522d66..3c63145b3 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -43,7 +43,6 @@ S3method(fit,epi_workflow)
 S3method(flusight_hub_formatter,canned_epipred)
 S3method(flusight_hub_formatter,data.frame)
 S3method(format,dist_quantiles)
-S3method(format,period)
 S3method(is.na,dist_quantiles)
 S3method(is.na,distribution)
 S3method(mean,dist_quantiles)

From a624b63244e71b2f45336ff47944760476be5239 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 07:41:26 -0800
Subject: [PATCH 05/25] use panel_data vignette for experiments, to be removed

---
 vignettes/panel-data.Rmd | 545 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 545 insertions(+)
 create mode 100644 vignettes/panel-data.Rmd

diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd
new file mode 100644
index 000000000..39321ae4c
--- /dev/null
+++ b/vignettes/panel-data.Rmd
@@ -0,0 +1,545 @@
+---
+title: "Using epipredict on non-epidemic panel data"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Using epipredict on non-epidemic panel data}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include=F}
+knitr::opts_chunk$set(
+  echo = TRUE,
+  collapse = TRUE,
+  comment = "#>",
+  out.width = "100%"
+)
+```
+
+```{r libraries}
+library(dplyr)
+library(tidyr)
+library(parsnip)
+library(recipes)
+library(epiprocess)
+library(epipredict)
+library(ggplot2)
+theme_set(theme_bw())
+```
+
+[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, 
+contain cross-sectional measurements of subjects over time. The `epipredict` 
+package is most suitable for running forecasters on epidemiological panel data. 
+A built-in example of this is the [`case_death_rate_subset`](
+  https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html) 
+dataset, which contains daily state-wise measures of `case_rate` and 
+`death_rate` for COVID-19 in 2021:
+
+```{r epi-panel-ex, include=T}
+head(case_death_rate_subset, 3)
+```
+
+`epipredict` functions work with data in 
+[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html) 
+format. Despite the stated goal and name of the package, other panel datasets 
+are also valid candidates for `epipredict` functionality, as long as they are 
+in `epi_df` format.
+
+```{r employ-stats, include=F}
+data("grad_employ_subset")
+year_start <- min(grad_employ_subset$time_value)
+year_end <- max(grad_employ_subset$time_value)
+```
+
+# Example panel data overview
+
+In this vignette, we will demonstrate using `epipredict` with employment panel
+data from Statistics Canada. We will be using 
+[
+  Table 37-10-0115-01: Characteristics and median employment income of 
+  longitudinal cohorts of postsecondary graduates two and five years after 
+  graduation, by educational qualification and field of study (primary 
+  groupings)
+](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501). 
+
+The full dataset contains yearly median employment income two and five years 
+after graduation, and number of graduates. The data is stratified by 
+variables such as geographic region (Canadian province), education, and 
+age group. The year range of the dataset is `r year_start` to `r year_end`, 
+inclusive. The full dataset also contains metadata that describes the 
+quality of data collected. For demonstration purposes, we make the following 
+modifications to get a subset of the full dataset:
+
+* Only keep provincial-level geographic region (the full data also has 
+"Canada" as a region)
+* Only keep "good" or better quality data rows, as indicated by the [`STATUS`](
+  https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column
+* Choose a subset of covariates and aggregate across the remaining ones. The 
+chosen covariates are age group, and educational qualification.
+
+Below is the query for obtaining the full data and code for subsetting it as we 
+just described:
+
+```{r employ-query, eval=F}
+library(cansim)
+
+# Get statcan data using get_cansim, which returns a tibble
+statcan_grad_employ <- get_cansim("37-10-0115-01")
+
+gemploy <- statcan_grad_employ %>%
+  # Drop some columns and rename the ones we keep
+  select(c(
+    "REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
+    "Field of study", "Gender", "Age group", "Status of student in Canada",
+    "Characteristics after graduation", "Graduate statistics"
+  )) %>%
+  rename(
+    "geo_value" = "GEO",
+    "time_value" = "REF_DATE",
+    "value" = "VALUE",
+    "status" = "STATUS",
+    "edu_qual" = "Educational qualification",
+    "fos" = "Field of study",
+    "gender" = "Gender",
+    "age_group" = "Age group",
+    "student_status" = "Status of student in Canada",
+    "grad_charac" = "Characteristics after graduation",
+    "grad_stat" = "Graduate statistics"
+  ) %>%
+  # The original `VALUE` column contain the statistic indicated by
+  # `Graduate statistics` in the original data. Below we pivot the data
+  # wider so that each unique statistic can have its own column.
+  mutate(
+    # Recode for easier pivoting
+    grad_stat = recode_factor(
+      grad_stat,
+      `Number of graduates` = "num_graduates",
+      `Median employment income two years after graduation` = "med_income_2y",
+      `Median employment income five years after graduation` = "med_income_5y"
+    ),
+    # They are originally strings but want ints for conversion to epi_df later
+    time_value = as.integer(time_value)
+  ) %>%
+  pivot_wider(names_from = grad_stat, values_from = value) %>%
+  filter(
+    # Drop aggregates for some columns
+    geo_value != "Canada" &
+      age_group != "15 to 64 years" &
+      edu_qual != "Total, educational qualification" &
+      # Keep aggregates for keys we don't want to keep
+      fos == "Total, field of study" &
+      gender == "Total, gender" &
+      student_status == "Canadian and international students" &
+      # Since we're looking at 2y and 5y employment income, the only
+      # characteristics remaining are:
+      # - Graduates reporting employment income
+      # - Graduates reporting wages, salaries, and commissions only
+      # For simplicity, keep the first one only
+      grad_charac == "Graduates reporting employment income" &
+      # Only keep "good" data
+      is.na(status) &
+      # Drop NA value rows
+      !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
+  ) %>%
+  select(-c(status, gender, student_status, grad_charac, fos))
+```
+
+To use this data with `epipredict`, we need to convert it into `epi_df` format 
+using [`as_epi_df`](
+  https://cmu-delphi.github.io/epiprocess/reference/as_epi_df.html) 
+with additional keys. In our case, the additional keys are `age_group`,
+and `edu_qual`. Note that in the above modifications, we encoded `time_value` 
+as type `integer`. This lets us set `time_type = "year"`, and ensures that
+lag and ahead modifications later on are using the correct time units. See the
+[`epi_df` documentation](
+  https://cmu-delphi.github.io/epiprocess/reference/epi_df.html#time-types) for 
+a list of all the `type_type`s available.
+
+```{r convert-to-epidf, eval=F}
+grad_employ_subset <- gemploy %>%
+  tsibble::as_tsibble(
+    index = time_value,
+    key = c(geo_value, age_group, edu_qual)
+  ) %>%
+  as_epi_df(
+    geo_type = "custom", time_type = "year",
+    additional_metadata = c(other_keys = list("age_group", "edu_qual"))
+  )
+```
+
+```{r data-dim, include=F}
+employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",")
+employ_colcount <- length(names(grad_employ_subset))
+```
+
+Now, we are ready to use `grad_employ_subset` with `epipredict`. 
+Our `epi_df` contains `r employ_rowcount` rows and `r employ_colcount` columns.
+Here is a quick summary of the columns in our `epi_df`:
+
+* `time_value` (time value): year in `date` format 
+* `geo_value` (geo value): province in Canada
+* `num_graduates` (raw, time series value): number of graduates 
+* `med_income_2y` (raw, time series value): median employment income 2 years 
+after graduation 
+* `med_income_5y` (raw, time series value): median employment income 5 years 
+after graduation  
+* `age_group` (key): one of two age groups, either 15 to 34 years, or 35 to 64 
+years 
+* `edu_qual` (key): one of 32 unique educational qualifications, e.g., 
+"Master's diploma"
+
+```{r preview-data, include=T}
+# Rename for simplicity
+employ <- grad_employ_subset
+sample_n(employ, 6)
+```
+
+In the following sections, we will go over preprocessing the data in the 
+`epi_recipe` framework, and fitting a model and making predictions within the 
+`epipredict` framework and using the package's canned forecasters.
+
+# Simple autoregressive with 3 lags to predict number of graduates in a year 
+
+## Preprocessing 
+
+As a simple example, let's work with the `num_graduates` column for now. We will 
+first pre-process by "standardizing" each numeric column by the total within 
+each group of keys. We do this since those raw numeric values will vary greatly
+from province to province since there are large differences in population. 
+
+```{r employ-small, include=T}
+employ_small <- employ %>%
+  group_by(geo_value, age_group, edu_qual) %>%
+  # Select groups where there are complete timeseries values
+  filter(n() >= 6) %>%
+  mutate(
+    num_graduates_prop = num_graduates / sum(num_graduates),
+    med_income_2y_prop = med_income_2y / sum(med_income_2y),
+    med_income_5y_prop = med_income_5y / sum(med_income_5y)
+  ) %>%
+  ungroup()
+head(employ_small)
+```
+
+Below is a visualization for a sample of the small data. Note that some groups 
+do not have any time series information since we filtered out all timeseries 
+with incomplete dates.
+
+```{r employ-small-graph, include=T, eval=T, fig.width=9, fig.height=6}
+employ_small %>%
+  filter(geo_value %in% c("British Columbia", "Ontario")) %>%
+  filter(grepl("degree", edu_qual, fixed = T)) %>%
+  group_by(geo_value, time_value, edu_qual, age_group) %>%
+  summarise(num_graduates_prop = sum(num_graduates_prop), .groups = "drop") %>%
+  ggplot(aes(x = time_value, y = num_graduates_prop, color = geo_value)) +
+  geom_line() +
+  scale_colour_manual(values = c("Cornflowerblue", "Orange"), name = "") +
+  facet_grid(rows = vars(edu_qual), cols = vars(age_group)) +
+  xlab("Year") +
+  ylab("Percentage of gratuates") +
+  theme(legend.position = "bottom")
+```
+
+We will predict the "standardized" number of graduates (a proportion) in the 
+next year (time $t+1$) using an autoregressive model with three lags (i.e., an 
+AR(3) model). Such a model is represented algebraically like this: 
+
+\[
+  x_{t+1} = 
+  \alpha_0 + \alpha_1 x_{t} + \alpha_2 x_{t-1} + \alpha_3 x_{t-2} + \epsilon_t  
+\]
+
+where $x_i$ is the proportion of graduates at time $i$, and the current time is 
+$t$.
+
+In the preprocessing step, we need to create additional columns in `employ` for 
+each of $x_{t+1}$, $x_{t}$, $x_{t-1}$, and $x_{t-2}$. We do this via an 
+`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these 
+outcome and predictor columns; the recipe just stores the instructions for 
+adding them. 
+
+Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and 
+3 `lag` columns representing $x_{t}$, $x_{t-1}$, and $x_{t-2}$. Also note that 
+since we specified our `time_type` to be `year`, our `lag` and `lead`
+values are both in years. 
+
+```{r make-recipe, include=T, eval=T}
+r <- epi_recipe(employ_small) %>%
+  step_epi_ahead(num_graduates_prop, ahead = 1) %>% 
+  step_epi_lag(num_graduates_prop, lag = 0:2) %>%
+  step_epi_naomit()
+r
+```
+
+Let's apply this recipe using `prep` and `bake` to generate and view the `lag` 
+and `ahead` columns.
+
+```{r view-preprocessed, include=T}
+# Display a sample of the preprocessed data
+bake_and_show_sample <- function(recipe, data, n = 5) {
+  recipe %>% prep(data) %>% bake(new_data = data) %>% sample_n(n)
+}
+
+r %>% bake_and_show_sample(employ_small)
+```
+
+We can see that the `prep` and `bake` steps created new columns according to 
+our `epi_recipe`:
+
+- `ahead_1_num_graduates_prop` corresponds to $x_{t+1}$
+- `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and 
+`lag_2_num_graduates_prop` correspond to $x_{t}$, $x_{t-1}$, and $x_{t-2}$ 
+respectively.
+
+## Model fitting and prediction
+
+Since our goal for now is to fit a simple autoregressive model, we can use
+[`parsnip::linear_reg()`](
+  https://parsnip.tidymodels.org/reference/linear_reg.html) with the default 
+engine `lm`, which fits a linear regression using ordinary least squares. 
+
+We will use `epi_workflow` with the `epi_recipe` we defined in the 
+preprocessing section along with the `parsnip::linear_reg()` model. Note again 
+that `epi_workflow` is a container and doesn't actually do the fitting. We have 
+to pass the workflow into `fit()` to get our model coefficients 
+$\alpha_i, i=0,...,3$.
+
+```{r linearreg-wf, include=T}
+wf_linreg <- epi_workflow(r, linear_reg()) %>%
+  fit(employ_small)
+summary(extract_fit_engine(wf_linreg))
+```
+
+This output tells us the coefficients of the fitted model; for instance, 
+the intercept is $\alpha_0 = 0.24804$ and the coefficient for $x_{t}$ is 
+$\alpha_1 = 0.06648$. The summary also tells us that only the intercept and lags
+at 2 years and 3 years ago have coefficients significantly greater than zero.
+
+Extracting the 95% confidence intervals for the coefficients also leads us to
+the same conclusion: all the coefficients except for $\alpha_1$ (lag 0) contain 
+0. 
+
+```{r}
+confint(extract_fit_engine(wf_linreg))
+```
+
+Now that we have our workflow, we can generate predictions from a subset of our 
+data. For this demo, we will predict the number of graduates using the last 2 
+years of our dataset.
+
+```{r linearreg-predict, include=T}
+latest <- get_test_data(recipe = r, x = employ_small)
+preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred))
+# Display a sample of the prediction values, excluding NAs
+preds %>% sample_n(5)
+```
+
+We can do this using the `augment` function too. Note that `predict` and 
+`augment` both still return an `epi_df` with all of the keys that were present 
+in the original dataset.
+
+```{r linearreg-augment, include=T, eval=F}
+augment(wf_linreg, latest) %>% sample_n(5)
+```
+
+## Model diagnostics 
+
+First, we'll plot the residuals (that is, $y_{t} - \hat{y}_{t}$) against the 
+fitted values ($\hat{y}_{t}$).
+
+```{r lienarreg-resid-plot, include=T, fig.height=8}
+par(mfrow = c(2, 2), mar = c(5, 3.5, 1, 1) + .5)
+plot(extract_fit_engine(wf_linreg))
+```
+
+The fitted values vs. residuals plot shows us that the residuals are mostly 
+clustered around zero, but do not form an even band around the zero line, 
+indicating that the variance of the residuals is not constant. Additionally,
+the fitted values vs. square root of standardized residuals makes this more 
+obvious - the spread of the square root of standardized residuals varies with 
+the fitted values. 
+
+The Q-Q plot shows us that the residuals have heavier tails than a Normal 
+distribution. So the normality of residuals assumption doesn't hold either.
+
+Finally, the residuals vs. leverage plot shows us that we have a few influential
+points based on the Cooks distance (those outside the red dotted line). 
+
+Since we appear to be violating the linear model assumptions, we might consider 
+transforming our data differently, or considering a non-linear model, or 
+something else.
+
+# Autoregressive model with exogenous inputs
+
+Now suppose we want to model the 5-year employment income using 3 lags, while 
+also incorporating information from the other two time-series in our dataset: 
+the 2-year employment income and the number of graduates in the previous 2 
+years. We would do this using an autoregressive model with exogenous inputs,
+defined as follows:
+
+\[
+  y_{t+1} = 
+  \alpha_0 + \alpha_1 y_{t} + \alpha_2 y_{t-1} + \alpha_3 y_{t-2} +
+  \beta_1 x_{t} + \beta_2 x_{t-1} 
+  \gamma_2 z_{t} + \gamma_2 z_{t-1} 
+\]
+
+where $y_i$ is the 5-year median income (proportion) at time $i$, 
+$x_i$ is the 2-year median income (proportion) at time $i$, and 
+$z_i$ is the number of graduates (proportion) at time $i$. 
+
+## Preprocessing 
+
+Again, we construct an `epi_recipe` detailing the preprocessing steps.
+
+```{r custom-arx, include=T}
+rx <- epi_recipe(employ_small) %>%
+  step_epi_ahead(med_income_5y_prop, ahead = 1) %>%
+  # 5-year median income has 3 lags c(0,1,2)
+  step_epi_lag(med_income_5y_prop, lag = c(0, 1, 2)) %>%
+  # But the two exogenous variables have 2 lags c(0,1)
+  step_epi_lag(med_income_2y_prop, lag = c(0, 1)) %>%
+  step_epi_lag(num_graduates_prop, lag = c(0, 1)) %>%
+  step_epi_naomit()
+
+bake_and_show_sample(rx, employ_small)
+```
+
+## Model fitting & postprocessing
+
+Before fitting our model and making predictions, let's add some post-processing
+steps using a few [`frosting`](
+  https://cmu-delphi.github.io/epipredict/reference/frosting.html) layers to do 
+a few things:
+
+1. Convert our predictions back to income values and number of graduates, 
+rather than standardized proportions. We do this via the frosting layer 
+`layer_population_scaling`. 
+2. Threshold our predictions to 0. We are predicting proportions, which can't 
+be negative. And the transformed values back to dollars and people can't be 
+negative either.
+3. Generate prediction intervals based on residual quantiles, allowing us to 
+quantify the uncertainty associated with future predicted values.
+
+```{r custom-arx-post, include=T}
+# Create dataframe of the sums we used for standardizing
+# Only have to include med_income_5y since that is our outcome
+totals <- employ_small %>%
+  group_by(geo_value, age_group, edu_qual) %>%
+  summarise(med_income_5y_tot = sum(med_income_5y), .groups = "drop")
+
+# Define post-processing steps
+f <- frosting() %>%
+  layer_predict() %>%
+  layer_naomit(.pred) %>%
+  layer_threshold(.pred, lower = 0) %>%
+  # 90% prediction interval
+  layer_residual_quantiles(
+    quantile_levels = c(0.1, 0.9), 
+    symmetrize = FALSE
+  ) %>%
+  layer_population_scaling(
+    .pred, .pred_distn,
+    df = totals, df_pop_col = "med_income_5y_tot"
+  )
+
+wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>%
+  fit(employ_small) %>%
+  add_frosting(f)
+
+summary(extract_fit_engine(wfx_linreg))
+```
+
+Based on the summary output for this model, only the intercept term, 5-year 
+median income from 3 years ago, and the 2-year median income from 1 year ago 
+are significant linear predictors for today's 5-year median income at a 95% 
+confidence level. Both lags for the number of graduates were insignificant.
+
+Let's take a look at the predictions along with their 90% prediction intervals.
+
+```{r}
+latest <- get_test_data(recipe = rx, x = employ_small)
+predsx <- predict(wfx_linreg, latest)
+
+# Display values within prediction intervals
+predsx %>%
+  select(
+    geo_value, time_value, edu_qual, age_group, fos,
+    .pred_scaled, .pred_distn_scaled
+  ) %>%
+  head() %>%
+  pivot_quantiles_wider(.pred_distn_scaled)
+```
+
+# Using canned forecasters
+
+We've seen what we can do with non-epidemiological panel data using the 
+recipes frame, with `epi_recipe` for preprocessing, `epi_workflow` for model 
+fitting, and `frosting` for postprocessing. 
+
+`epipredict` also comes with canned forecasters that do all of those steps 
+behind the scenes for some simple models. Even though we aren't working with 
+epidemiological data, canned forecasters still work as expected, out of the box.
+We will demonstrate this with the simple
+[`flatline_forecaster`](
+  https://cmu-delphi.github.io/epipredict/reference/flatline_forecaster.html) 
+and the direct autoregressive (AR) forecaster 
+[`arx_forecaster`](
+  https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html).
+
+For both illustrations, we will continue to use the `employ_small` dataset 
+with the transformed numeric columns that are proportions within each group 
+by the keys in our `epi_df`.
+
+## Flatline forecaster 
+
+In this first example, we'll use `flatline_forecaster` to make a simple
+prediction of the 2-year median income for the next year, based on one previous 
+time point. This model is representated algebraically as:
+\[y_{t+1} = \alpha_0 + \alpha_0 y_{t}\]
+where $y_i$ is the 2-year median income (proportion) at time $i$.
+
+```{r flatline, include=T, warning=F}
+out_fl <- flatline_forecaster(employ_small, "med_income_2y_prop",
+  args_list = flatline_args_list(ahead = 1)
+)
+
+out_fl
+```
+
+## Autoregressive forecaster with exogenous inputs
+
+In this second example, we'll use `arx_forecaster` to make a prediction of the 
+5-year median income based using two lags, _and_ using two lags on two exogenous
+variables: 2-year median income and number of graduates. 
+
+The canned forecaster gives us a simple way of making this forecast since it 
+defines the recipe, workflow, and post-processing steps behind the scenes. This 
+is very similar to the model we introduced in the "Autoregressive Linear Model 
+with Exogenous Inputs" section of this article, but where all inputs have the 
+same number of lags.
+
+```{r arx-lr, include=T, warning=F}
+arx_args <- arx_args_list(lags = c(0L, 1L), ahead = 1L)
+
+out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop",
+  c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
+  args_list = arx_args
+)
+
+out_arx_lr
+```
+
+Other changes to the direct AR forecaster, like changing the engine, also work 
+as expected. Below we use a boosted tree model instead of a linear regression.
+
+```{r arx-rf, include=T, warning=F}
+out_arx_rf <- arx_forecaster(
+  employ_small, "med_income_5y_prop",
+  c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
+  trainer = parsnip::boost_tree(mode = "regression", trees = 20),
+  args_list = arx_args
+)
+
+out_arx_rf
+```

From 69b21c67cb8b891b9377b0da1936e5eefa8205ed Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:01:35 -0800
Subject: [PATCH 06/25] remove browser()

---
 R/layer_add_target_date.R    | 3 +--
 R/layer_population_scaling.R | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index 6a48cc923..8ab077efa 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -89,7 +89,6 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
   )$time_type
   if (expected_time_type == "week") expected_time_type = "day"
 
-  browser()
   if (!is.null(object$target_date)) {
     check <- validate_date(object$target_date, expected_time_type)
     if (!check$ok) {
@@ -114,7 +113,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
         i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
       ))
     }
-    forecast_date <- coerce_time_type(fd, expected_time_type)
+    forecast_date <- coerce_time_type(possible_fd, expected_time_type)
     ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
     target_date <- forecast_date + ahead
   } else {
diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R
index 1cdc35d7c..f89160794 100644
--- a/R/layer_population_scaling.R
+++ b/R/layer_population_scaling.R
@@ -144,7 +144,6 @@ slather.layer_population_scaling <-
         length(object$df_pop_col) == 1
     )
 
-    browser()
     if (is.null(object$by)) {
       object$by <- intersect(
         kill_time_value(epi_keys(components$predictions)),

From 3498fcfdc614ccca36bdc8ebe7c5f0f4f9988c6e Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:01:54 -0800
Subject: [PATCH 07/25] prefix with utils to avoid check warning

---
 R/dist_quantiles.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R
index 7f7af40f4..f20ebe3dc 100644
--- a/R/dist_quantiles.R
+++ b/R/dist_quantiles.R
@@ -233,10 +233,10 @@ quantile_extrapolate <- function(x, tau_out, middle) {
       dplyr::arrange(q)
   }
   if (any(indl)) {
-    qvals_out[indl] <- tail_extrapolate(tau_out[indl], head(qv, 2))
+    qvals_out[indl] <- tail_extrapolate(tau_out[indl], utils::head(qv, 2))
   }
   if (any(indr)) {
-    qvals_out[indr] <- tail_extrapolate(tau_out[indr], tail(qv, 2))
+    qvals_out[indr] <- tail_extrapolate(tau_out[indr], utils::tail(qv, 2))
   }
   qvals_out
 }

From b2b8134d033d7af825cef30e5743c33acee93ebe Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:02:05 -0800
Subject: [PATCH 08/25] redocument

---
 man/step_epi_shift.Rd | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd
index bf135346e..f4419b831 100644
--- a/man/step_epi_shift.Rd
+++ b/man/step_epi_shift.Rd
@@ -8,9 +8,9 @@
 step_epi_lag(
   recipe,
   ...,
+  lag,
   role = "predictor",
   trained = FALSE,
-  lag,
   prefix = "lag_",
   default = NA,
   columns = NULL,
@@ -21,9 +21,9 @@ step_epi_lag(
 step_epi_ahead(
   recipe,
   ...,
+  ahead,
   role = "outcome",
   trained = FALSE,
-  ahead,
   prefix = "ahead_",
   default = NA,
   columns = NULL,
@@ -38,16 +38,16 @@ sequence of operations for this recipe.}
 \item{...}{One or more selector functions to choose variables
 for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.}
 
+\item{lag, ahead}{A vector of integers. Each specified column will
+be the lag or lead for each value in the vector. Lag integers must be
+nonnegative, while ahead integers must be positive.}
+
 \item{role}{For model terms created by this step, what analysis role should
 they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.}
 
 \item{trained}{A logical to indicate if the quantities for
 preprocessing have been estimated.}
 
-\item{lag, ahead}{A vector of integers. Each specified column will
-be the lag or lead for each value in the vector. Lag integers must be
-nonnegative, while ahead integers must be positive.}
-
 \item{prefix}{A prefix to indicate what type of variable this is}
 
 \item{default}{Determines what fills empty rows

From f2f39a264e5fbf96b9b7df7f7bdaba870c270147 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:02:39 -0800
Subject: [PATCH 09/25] ahead/lag will always be integer type now

---
 tests/testthat/test-extract_argument.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-extract_argument.R b/tests/testthat/test-extract_argument.R
index 0654304ba..3250b2991 100644
--- a/tests/testthat/test-extract_argument.R
+++ b/tests/testthat/test-extract_argument.R
@@ -43,19 +43,19 @@ test_that("recipe argument extractor works", {
 
   expect_error(extract_argument(r$steps[[1]], "uhoh", "bubble"))
   expect_error(extract_argument(r$steps[[1]], "step_epi_lag", "bubble"))
-  expect_identical(extract_argument(r$steps[[2]], "step_epi_ahead", "ahead"), 7)
+  expect_identical(extract_argument(r$steps[[2]], "step_epi_ahead", "ahead"), 7L)
 
 
   expect_error(extract_argument(r, "step_lightly", "quantile_levels"))
   expect_identical(
     extract_argument(r, "step_epi_lag", "lag"),
-    list(c(0, 7, 14), c(0, 7, 14))
+    list(c(0L, 7L, 14L), c(0L, 7L, 14L))
   )
 
   wf <- epi_workflow(preprocessor = r)
   expect_error(extract_argument(epi_workflow(), "step_epi_lag", "lag"))
   expect_identical(
     extract_argument(wf, "step_epi_lag", "lag"),
-    list(c(0, 7, 14), c(0, 7, 14))
+    list(c(0L, 7L, 14L), c(0L, 7L, 14L))
   )
 })

From f9859fe880a0209ab7d3f1c8c454c0a9c2b3f1df Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:16:37 -0800
Subject: [PATCH 10/25] we no longer warn if as_of > forecast_date

---
 tests/testthat/test-layer_add_forecast_date.R | 25 +++++++++++--------
 tests/testthat/test-layer_add_target_date.R   |  3 ++-
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 1830118dc..29a427c3a 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -11,8 +11,9 @@ latest <- jhu %>%
 
 test_that("layer validation works", {
   f <- frosting()
-  expect_error(layer_add_forecast_date(f, "a"))
-  expect_error(layer_add_forecast_date(f, "2022-05-31", id = c("a", "b")))
+
+
+  expect_error(layer_add_forecast_date(f, "2022-05-31", id = 2))
   expect_silent(layer_add_forecast_date(f, "2022-05-31"))
   expect_silent(layer_add_forecast_date(f))
   expect_silent(layer_add_forecast_date(f, as.Date("2022-05-31")))
@@ -41,10 +42,12 @@ test_that("Specify a `forecast_date` that is less than `as_of` date", {
     layer_naomit(.pred)
   wf2 <- wf %>% add_frosting(f2)
 
-  expect_warning(
-    p2 <- predict(wf2, latest),
-    "forecast_date is less than the most recent update date of the data."
-  )
+  # this warning has been removed
+  # expect_warning(
+  #   p2 <- predict(wf2, latest),
+  #   "forecast_date is less than the most recent update date of the data."
+  # )
+  expect_silent(p2 <- predict(wf2, latest))
   expect_equal(ncol(p2), 4L)
   expect_s3_class(p2, "epi_df")
   expect_equal(nrow(p2), 3L)
@@ -59,10 +62,12 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", {
     layer_naomit(.pred)
   wf3 <- wf %>% add_frosting(f3)
 
-  expect_warning(
-    p3 <- predict(wf3, latest),
-    "forecast_date is less than the most recent update date of the data."
-  )
+  # this warning has been removed
+  # expect_warning(
+  #   p3 <- predict(wf3, latest),
+  #   "forecast_date is less than the most recent update date of the data."
+  # )
+  expect_silent(p3 <- predict(wf3, latest))
   expect_equal(ncol(p3), 4L)
   expect_s3_class(p3, "epi_df")
   expect_equal(nrow(p3), 3L)
diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index 287956612..bf752e968 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -31,7 +31,8 @@ test_that("Use ahead + max time value from pre, fit, post", {
     layer_naomit(.pred)
   wf2 <- wf %>% add_frosting(f2)
 
-  expect_warning(p2 <- predict(wf2, latest))
+  # expect_warning(p2 <- predict(wf2, latest)) # this warning has been removed
+  expect_silent(p2 <- predict(wf2, latest))
   expect_equal(ncol(p2), 5L)
   expect_s3_class(p2, "epi_df")
   expect_equal(nrow(p2), 3L)

From fb7faae256731bcf51a351d0a3d268289c4df0ed Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:17:01 -0800
Subject: [PATCH 11/25] upstream changes to prep() no require data

---
 tests/testthat/test-step_growth_rate.R    |  8 ++++----
 tests/testthat/test-step_lag_difference.R | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/testthat/test-step_growth_rate.R b/tests/testthat/test-step_growth_rate.R
index d0dec170e..052141710 100644
--- a/tests/testthat/test-step_growth_rate.R
+++ b/tests/testthat/test-step_growth_rate.R
@@ -34,7 +34,7 @@ test_that("step_growth_rate works for a single signal", {
 
   res <- r %>%
     step_growth_rate(value, horizon = 1) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$gr_1_rel_change_value, c(NA, 1 / 6:9))
 
@@ -46,7 +46,7 @@ test_that("step_growth_rate works for a single signal", {
   r <- epi_recipe(edf)
   res <- r %>%
     step_growth_rate(value, horizon = 1) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$gr_1_rel_change_value, rep(c(NA, 1 / 6:9), each = 2))
 })
@@ -63,7 +63,7 @@ test_that("step_growth_rate works for a two signals", {
 
   res <- r %>%
     step_growth_rate(v1, v2, horizon = 1) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$gr_1_rel_change_v1, c(NA, 1 / 6:9))
   expect_equal(res$gr_1_rel_change_v2, c(NA, 1 / 1:4))
@@ -76,7 +76,7 @@ test_that("step_growth_rate works for a two signals", {
   r <- epi_recipe(edf)
   res <- r %>%
     step_growth_rate(v1, v2, horizon = 1) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$gr_1_rel_change_v1, rep(c(NA, 1 / 6:9), each = 2))
   expect_equal(res$gr_1_rel_change_v2, rep(c(NA, 1 / 1:4), each = 2))
diff --git a/tests/testthat/test-step_lag_difference.R b/tests/testthat/test-step_lag_difference.R
index dc61d12d4..c0fd377e6 100644
--- a/tests/testthat/test-step_lag_difference.R
+++ b/tests/testthat/test-step_lag_difference.R
@@ -27,13 +27,13 @@ test_that("step_lag_difference works for a single signal", {
 
   res <- r %>%
     step_lag_difference(value, horizon = 1) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$lag_diff_1_value, c(NA, rep(1, 4)))
 
   res <- r %>%
     step_lag_difference(value, horizon = 1:2) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$lag_diff_1_value, c(NA, rep(1, 4)))
   expect_equal(res$lag_diff_2_value, c(NA, NA, rep(2, 3)))
@@ -48,7 +48,7 @@ test_that("step_lag_difference works for a single signal", {
   r <- epi_recipe(edf)
   res <- r %>%
     step_lag_difference(value, horizon = 1) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$lag_diff_1_value, c(NA, NA, rep(1, 8)))
 })
@@ -65,7 +65,7 @@ test_that("step_lag_difference works for a two signals", {
 
   res <- r %>%
     step_lag_difference(v1, v2, horizon = 1:2) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$lag_diff_1_v1, c(NA, rep(1, 4)))
   expect_equal(res$lag_diff_2_v1, c(NA, NA, rep(2, 3)))
@@ -80,7 +80,7 @@ test_that("step_lag_difference works for a two signals", {
   r <- epi_recipe(edf)
   res <- r %>%
     step_lag_difference(v1, v2, horizon = 1:2) %>%
-    prep() %>%
+    prep(edf) %>%
     bake(edf)
   expect_equal(res$lag_diff_1_v1, rep(c(NA, rep(1, 4)), each = 2))
   expect_equal(res$lag_diff_2_v1, rep(c(NA, NA, rep(2, 3)), each = 2))

From 78ff1f94b341758f24a80e3e236f0504e19418ab Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Sat, 9 Mar 2024 08:19:05 -0800
Subject: [PATCH 12/25] remove unused test

---
 tests/testthat/test-propagate_samples.R | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 tests/testthat/test-propagate_samples.R

diff --git a/tests/testthat/test-propagate_samples.R b/tests/testthat/test-propagate_samples.R
deleted file mode 100644
index 5278ab385..000000000
--- a/tests/testthat/test-propagate_samples.R
+++ /dev/null
@@ -1,7 +0,0 @@
-test_that("propagate_samples", {
-  r <- -30:50
-  p <- 40
-  quantiles <- 1:9 / 10
-  aheads <- c(2, 4, 7)
-  nsim <- 100
-})

From 3d059f42951db9f7a1e114b67d915712eda93046 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Mon, 18 Mar 2024 11:32:19 -0700
Subject: [PATCH 13/25] always a scalar

---
 R/time_types.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/time_types.R b/R/time_types.R
index 02852696d..5f3aa4c14 100644
--- a/R/time_types.R
+++ b/R/time_types.R
@@ -1,6 +1,8 @@
 guess_time_type <- function(time_value) {
+  # similar to epiprocess:::guess_time_type() but w/o the gap handling
+  arg_is_scalar(time_value)
   if (is.character(time_value)) {
-    if (nchar(time_value[1]) <= "10") {
+    if (nchar(time_value) <= "10") {
       new_time_value <- tryCatch({
         as.Date(time_value)
       }, error = function(e) NULL)

From de9e00a8163a9fc8028879d6c10b3bd37c408328 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Mon, 18 Mar 2024 11:36:26 -0700
Subject: [PATCH 14/25] add some additional tests

---
 tests/testthat/test-layer_add_forecast_date.R | 32 +++++++++++++++++++
 tests/testthat/test-layer_add_target_date.R   | 32 +++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 29a427c3a..93f13baf7 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -74,3 +74,35 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", {
   expect_equal(p3$forecast_date, rep(as.Date("2021-12-31"), times = 3))
   expect_named(p3, c("geo_value", "time_value", ".pred", "forecast_date"))
 })
+
+
+test_that("forecast date works for daily", {
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_add_forecast_date() %>%
+    layer_naomit(.pred)
+
+  wf1 <- add_frosting(wf, f)
+  p <- predict(wf1, latest)
+  expect_identical(p$forecast_date[1], as.Date("2021-12-31"))
+
+  latest_bad <- latest %>%
+    unclass() %>%
+    as.data.frame() %>%
+    mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>%
+    as_epi_df()
+
+  expect_error(predict(wf1, latest_bad))
+  wf1 <- add_frosting(
+    wf,
+    adjust_frosting(f, "layer_add_forecast_date", forecast_date = "2022-01-01")
+  )
+  expect_silent(predict(wf1, latest))
+
+  wf1 <- add_frosting(
+    wf,
+    adjust_frosting(f, "layer_add_forecast_date", forecast_date = 2022L)
+  )
+  expect_error(predict(wf1, latest)) # wrong time type of forecast_date
+
+})
diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index bf752e968..f3b706aa6 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -86,3 +86,35 @@ test_that("Specify own target date", {
   expect_equal(p2$target_date, rep(as.Date("2022-01-08"), times = 3))
   expect_named(p2, c("geo_value", "time_value", ".pred", "target_date"))
 })
+
+test_that("forecast date works for daily", {
+  f <- frosting() %>%
+    layer_predict() %>%
+    layer_add_target_date() %>%
+    layer_naomit(.pred)
+
+  wf1 <- add_frosting(wf, f)
+  p <- predict(wf1, latest)
+  expect_identical(p$target_date[1], as.Date("2021-12-31") + 7L)
+
+  latest_bad <- latest %>%
+    unclass() %>%
+    as.data.frame() %>%
+    mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>%
+    as_epi_df()
+
+  expect_error(predict(wf1, latest_bad))
+  wf1 <- add_frosting(
+    wf,
+    adjust_frosting(f, "layer_add_target_date", target_date = "2022-01-07")
+  )
+  expect_silent(predict(wf1, latest))
+
+  wf1 <- add_frosting(
+    wf,
+    adjust_frosting(f, "layer_add_target_date", target_date = 2022L)
+  )
+  expect_error(predict(wf1, latest)) # wrong time type of forecast_date
+
+})
+

From 0c3872360ed0b26a122d9ce911dd61f6fcb52ecc Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Mon, 18 Mar 2024 11:40:53 -0700
Subject: [PATCH 15/25] this shouldnt be on this branch

---
 vignettes/panel-data.Rmd | 545 ---------------------------------------
 1 file changed, 545 deletions(-)
 delete mode 100644 vignettes/panel-data.Rmd

diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd
deleted file mode 100644
index 39321ae4c..000000000
--- a/vignettes/panel-data.Rmd
+++ /dev/null
@@ -1,545 +0,0 @@
----
-title: "Using epipredict on non-epidemic panel data"
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Using epipredict on non-epidemic panel data}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r setup, include=F}
-knitr::opts_chunk$set(
-  echo = TRUE,
-  collapse = TRUE,
-  comment = "#>",
-  out.width = "100%"
-)
-```
-
-```{r libraries}
-library(dplyr)
-library(tidyr)
-library(parsnip)
-library(recipes)
-library(epiprocess)
-library(epipredict)
-library(ggplot2)
-theme_set(theme_bw())
-```
-
-[Panel data](https://en.wikipedia.org/wiki/Panel_data), or longitudinal data, 
-contain cross-sectional measurements of subjects over time. The `epipredict` 
-package is most suitable for running forecasters on epidemiological panel data. 
-A built-in example of this is the [`case_death_rate_subset`](
-  https://cmu-delphi.github.io/epipredict/reference/case_death_rate_subset.html) 
-dataset, which contains daily state-wise measures of `case_rate` and 
-`death_rate` for COVID-19 in 2021:
-
-```{r epi-panel-ex, include=T}
-head(case_death_rate_subset, 3)
-```
-
-`epipredict` functions work with data in 
-[`epi_df`](https://cmu-delphi.github.io/epiprocess/reference/epi_df.html) 
-format. Despite the stated goal and name of the package, other panel datasets 
-are also valid candidates for `epipredict` functionality, as long as they are 
-in `epi_df` format.
-
-```{r employ-stats, include=F}
-data("grad_employ_subset")
-year_start <- min(grad_employ_subset$time_value)
-year_end <- max(grad_employ_subset$time_value)
-```
-
-# Example panel data overview
-
-In this vignette, we will demonstrate using `epipredict` with employment panel
-data from Statistics Canada. We will be using 
-[
-  Table 37-10-0115-01: Characteristics and median employment income of 
-  longitudinal cohorts of postsecondary graduates two and five years after 
-  graduation, by educational qualification and field of study (primary 
-  groupings)
-](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3710011501). 
-
-The full dataset contains yearly median employment income two and five years 
-after graduation, and number of graduates. The data is stratified by 
-variables such as geographic region (Canadian province), education, and 
-age group. The year range of the dataset is `r year_start` to `r year_end`, 
-inclusive. The full dataset also contains metadata that describes the 
-quality of data collected. For demonstration purposes, we make the following 
-modifications to get a subset of the full dataset:
-
-* Only keep provincial-level geographic region (the full data also has 
-"Canada" as a region)
-* Only keep "good" or better quality data rows, as indicated by the [`STATUS`](
-  https://www.statcan.gc.ca/en/concepts/definitions/guide-symbol) column
-* Choose a subset of covariates and aggregate across the remaining ones. The 
-chosen covariates are age group, and educational qualification.
-
-Below is the query for obtaining the full data and code for subsetting it as we 
-just described:
-
-```{r employ-query, eval=F}
-library(cansim)
-
-# Get statcan data using get_cansim, which returns a tibble
-statcan_grad_employ <- get_cansim("37-10-0115-01")
-
-gemploy <- statcan_grad_employ %>%
-  # Drop some columns and rename the ones we keep
-  select(c(
-    "REF_DATE", "GEO", "VALUE", "STATUS", "Educational qualification",
-    "Field of study", "Gender", "Age group", "Status of student in Canada",
-    "Characteristics after graduation", "Graduate statistics"
-  )) %>%
-  rename(
-    "geo_value" = "GEO",
-    "time_value" = "REF_DATE",
-    "value" = "VALUE",
-    "status" = "STATUS",
-    "edu_qual" = "Educational qualification",
-    "fos" = "Field of study",
-    "gender" = "Gender",
-    "age_group" = "Age group",
-    "student_status" = "Status of student in Canada",
-    "grad_charac" = "Characteristics after graduation",
-    "grad_stat" = "Graduate statistics"
-  ) %>%
-  # The original `VALUE` column contain the statistic indicated by
-  # `Graduate statistics` in the original data. Below we pivot the data
-  # wider so that each unique statistic can have its own column.
-  mutate(
-    # Recode for easier pivoting
-    grad_stat = recode_factor(
-      grad_stat,
-      `Number of graduates` = "num_graduates",
-      `Median employment income two years after graduation` = "med_income_2y",
-      `Median employment income five years after graduation` = "med_income_5y"
-    ),
-    # They are originally strings but want ints for conversion to epi_df later
-    time_value = as.integer(time_value)
-  ) %>%
-  pivot_wider(names_from = grad_stat, values_from = value) %>%
-  filter(
-    # Drop aggregates for some columns
-    geo_value != "Canada" &
-      age_group != "15 to 64 years" &
-      edu_qual != "Total, educational qualification" &
-      # Keep aggregates for keys we don't want to keep
-      fos == "Total, field of study" &
-      gender == "Total, gender" &
-      student_status == "Canadian and international students" &
-      # Since we're looking at 2y and 5y employment income, the only
-      # characteristics remaining are:
-      # - Graduates reporting employment income
-      # - Graduates reporting wages, salaries, and commissions only
-      # For simplicity, keep the first one only
-      grad_charac == "Graduates reporting employment income" &
-      # Only keep "good" data
-      is.na(status) &
-      # Drop NA value rows
-      !is.na(num_graduates) & !is.na(med_income_2y) & !is.na(med_income_5y)
-  ) %>%
-  select(-c(status, gender, student_status, grad_charac, fos))
-```
-
-To use this data with `epipredict`, we need to convert it into `epi_df` format 
-using [`as_epi_df`](
-  https://cmu-delphi.github.io/epiprocess/reference/as_epi_df.html) 
-with additional keys. In our case, the additional keys are `age_group`,
-and `edu_qual`. Note that in the above modifications, we encoded `time_value` 
-as type `integer`. This lets us set `time_type = "year"`, and ensures that
-lag and ahead modifications later on are using the correct time units. See the
-[`epi_df` documentation](
-  https://cmu-delphi.github.io/epiprocess/reference/epi_df.html#time-types) for 
-a list of all the `type_type`s available.
-
-```{r convert-to-epidf, eval=F}
-grad_employ_subset <- gemploy %>%
-  tsibble::as_tsibble(
-    index = time_value,
-    key = c(geo_value, age_group, edu_qual)
-  ) %>%
-  as_epi_df(
-    geo_type = "custom", time_type = "year",
-    additional_metadata = c(other_keys = list("age_group", "edu_qual"))
-  )
-```
-
-```{r data-dim, include=F}
-employ_rowcount <- format(nrow(grad_employ_subset), big.mark = ",")
-employ_colcount <- length(names(grad_employ_subset))
-```
-
-Now, we are ready to use `grad_employ_subset` with `epipredict`. 
-Our `epi_df` contains `r employ_rowcount` rows and `r employ_colcount` columns.
-Here is a quick summary of the columns in our `epi_df`:
-
-* `time_value` (time value): year in `date` format 
-* `geo_value` (geo value): province in Canada
-* `num_graduates` (raw, time series value): number of graduates 
-* `med_income_2y` (raw, time series value): median employment income 2 years 
-after graduation 
-* `med_income_5y` (raw, time series value): median employment income 5 years 
-after graduation  
-* `age_group` (key): one of two age groups, either 15 to 34 years, or 35 to 64 
-years 
-* `edu_qual` (key): one of 32 unique educational qualifications, e.g., 
-"Master's diploma"
-
-```{r preview-data, include=T}
-# Rename for simplicity
-employ <- grad_employ_subset
-sample_n(employ, 6)
-```
-
-In the following sections, we will go over preprocessing the data in the 
-`epi_recipe` framework, and fitting a model and making predictions within the 
-`epipredict` framework and using the package's canned forecasters.
-
-# Simple autoregressive with 3 lags to predict number of graduates in a year 
-
-## Preprocessing 
-
-As a simple example, let's work with the `num_graduates` column for now. We will 
-first pre-process by "standardizing" each numeric column by the total within 
-each group of keys. We do this since those raw numeric values will vary greatly
-from province to province since there are large differences in population. 
-
-```{r employ-small, include=T}
-employ_small <- employ %>%
-  group_by(geo_value, age_group, edu_qual) %>%
-  # Select groups where there are complete timeseries values
-  filter(n() >= 6) %>%
-  mutate(
-    num_graduates_prop = num_graduates / sum(num_graduates),
-    med_income_2y_prop = med_income_2y / sum(med_income_2y),
-    med_income_5y_prop = med_income_5y / sum(med_income_5y)
-  ) %>%
-  ungroup()
-head(employ_small)
-```
-
-Below is a visualization for a sample of the small data. Note that some groups 
-do not have any time series information since we filtered out all timeseries 
-with incomplete dates.
-
-```{r employ-small-graph, include=T, eval=T, fig.width=9, fig.height=6}
-employ_small %>%
-  filter(geo_value %in% c("British Columbia", "Ontario")) %>%
-  filter(grepl("degree", edu_qual, fixed = T)) %>%
-  group_by(geo_value, time_value, edu_qual, age_group) %>%
-  summarise(num_graduates_prop = sum(num_graduates_prop), .groups = "drop") %>%
-  ggplot(aes(x = time_value, y = num_graduates_prop, color = geo_value)) +
-  geom_line() +
-  scale_colour_manual(values = c("Cornflowerblue", "Orange"), name = "") +
-  facet_grid(rows = vars(edu_qual), cols = vars(age_group)) +
-  xlab("Year") +
-  ylab("Percentage of gratuates") +
-  theme(legend.position = "bottom")
-```
-
-We will predict the "standardized" number of graduates (a proportion) in the 
-next year (time $t+1$) using an autoregressive model with three lags (i.e., an 
-AR(3) model). Such a model is represented algebraically like this: 
-
-\[
-  x_{t+1} = 
-  \alpha_0 + \alpha_1 x_{t} + \alpha_2 x_{t-1} + \alpha_3 x_{t-2} + \epsilon_t  
-\]
-
-where $x_i$ is the proportion of graduates at time $i$, and the current time is 
-$t$.
-
-In the preprocessing step, we need to create additional columns in `employ` for 
-each of $x_{t+1}$, $x_{t}$, $x_{t-1}$, and $x_{t-2}$. We do this via an 
-`epi_recipe`. Note that creating an `epi_recipe` alone doesn't add these 
-outcome and predictor columns; the recipe just stores the instructions for 
-adding them. 
-
-Our `epi_recipe` should add one `ahead` column representing $x_{t+1}$ and 
-3 `lag` columns representing $x_{t}$, $x_{t-1}$, and $x_{t-2}$. Also note that 
-since we specified our `time_type` to be `year`, our `lag` and `lead`
-values are both in years. 
-
-```{r make-recipe, include=T, eval=T}
-r <- epi_recipe(employ_small) %>%
-  step_epi_ahead(num_graduates_prop, ahead = 1) %>% 
-  step_epi_lag(num_graduates_prop, lag = 0:2) %>%
-  step_epi_naomit()
-r
-```
-
-Let's apply this recipe using `prep` and `bake` to generate and view the `lag` 
-and `ahead` columns.
-
-```{r view-preprocessed, include=T}
-# Display a sample of the preprocessed data
-bake_and_show_sample <- function(recipe, data, n = 5) {
-  recipe %>% prep(data) %>% bake(new_data = data) %>% sample_n(n)
-}
-
-r %>% bake_and_show_sample(employ_small)
-```
-
-We can see that the `prep` and `bake` steps created new columns according to 
-our `epi_recipe`:
-
-- `ahead_1_num_graduates_prop` corresponds to $x_{t+1}$
-- `lag_0_num_graduates_prop`, `lag_1_num_graduates_prop`, and 
-`lag_2_num_graduates_prop` correspond to $x_{t}$, $x_{t-1}$, and $x_{t-2}$ 
-respectively.
-
-## Model fitting and prediction
-
-Since our goal for now is to fit a simple autoregressive model, we can use
-[`parsnip::linear_reg()`](
-  https://parsnip.tidymodels.org/reference/linear_reg.html) with the default 
-engine `lm`, which fits a linear regression using ordinary least squares. 
-
-We will use `epi_workflow` with the `epi_recipe` we defined in the 
-preprocessing section along with the `parsnip::linear_reg()` model. Note again 
-that `epi_workflow` is a container and doesn't actually do the fitting. We have 
-to pass the workflow into `fit()` to get our model coefficients 
-$\alpha_i, i=0,...,3$.
-
-```{r linearreg-wf, include=T}
-wf_linreg <- epi_workflow(r, linear_reg()) %>%
-  fit(employ_small)
-summary(extract_fit_engine(wf_linreg))
-```
-
-This output tells us the coefficients of the fitted model; for instance, 
-the intercept is $\alpha_0 = 0.24804$ and the coefficient for $x_{t}$ is 
-$\alpha_1 = 0.06648$. The summary also tells us that only the intercept and lags
-at 2 years and 3 years ago have coefficients significantly greater than zero.
-
-Extracting the 95% confidence intervals for the coefficients also leads us to
-the same conclusion: all the coefficients except for $\alpha_1$ (lag 0) contain 
-0. 
-
-```{r}
-confint(extract_fit_engine(wf_linreg))
-```
-
-Now that we have our workflow, we can generate predictions from a subset of our 
-data. For this demo, we will predict the number of graduates using the last 2 
-years of our dataset.
-
-```{r linearreg-predict, include=T}
-latest <- get_test_data(recipe = r, x = employ_small)
-preds <- stats::predict(wf_linreg, latest) %>% filter(!is.na(.pred))
-# Display a sample of the prediction values, excluding NAs
-preds %>% sample_n(5)
-```
-
-We can do this using the `augment` function too. Note that `predict` and 
-`augment` both still return an `epi_df` with all of the keys that were present 
-in the original dataset.
-
-```{r linearreg-augment, include=T, eval=F}
-augment(wf_linreg, latest) %>% sample_n(5)
-```
-
-## Model diagnostics 
-
-First, we'll plot the residuals (that is, $y_{t} - \hat{y}_{t}$) against the 
-fitted values ($\hat{y}_{t}$).
-
-```{r lienarreg-resid-plot, include=T, fig.height=8}
-par(mfrow = c(2, 2), mar = c(5, 3.5, 1, 1) + .5)
-plot(extract_fit_engine(wf_linreg))
-```
-
-The fitted values vs. residuals plot shows us that the residuals are mostly 
-clustered around zero, but do not form an even band around the zero line, 
-indicating that the variance of the residuals is not constant. Additionally,
-the fitted values vs. square root of standardized residuals makes this more 
-obvious - the spread of the square root of standardized residuals varies with 
-the fitted values. 
-
-The Q-Q plot shows us that the residuals have heavier tails than a Normal 
-distribution. So the normality of residuals assumption doesn't hold either.
-
-Finally, the residuals vs. leverage plot shows us that we have a few influential
-points based on the Cooks distance (those outside the red dotted line). 
-
-Since we appear to be violating the linear model assumptions, we might consider 
-transforming our data differently, or considering a non-linear model, or 
-something else.
-
-# Autoregressive model with exogenous inputs
-
-Now suppose we want to model the 5-year employment income using 3 lags, while 
-also incorporating information from the other two time-series in our dataset: 
-the 2-year employment income and the number of graduates in the previous 2 
-years. We would do this using an autoregressive model with exogenous inputs,
-defined as follows:
-
-\[
-  y_{t+1} = 
-  \alpha_0 + \alpha_1 y_{t} + \alpha_2 y_{t-1} + \alpha_3 y_{t-2} +
-  \beta_1 x_{t} + \beta_2 x_{t-1} 
-  \gamma_2 z_{t} + \gamma_2 z_{t-1} 
-\]
-
-where $y_i$ is the 5-year median income (proportion) at time $i$, 
-$x_i$ is the 2-year median income (proportion) at time $i$, and 
-$z_i$ is the number of graduates (proportion) at time $i$. 
-
-## Preprocessing 
-
-Again, we construct an `epi_recipe` detailing the preprocessing steps.
-
-```{r custom-arx, include=T}
-rx <- epi_recipe(employ_small) %>%
-  step_epi_ahead(med_income_5y_prop, ahead = 1) %>%
-  # 5-year median income has 3 lags c(0,1,2)
-  step_epi_lag(med_income_5y_prop, lag = c(0, 1, 2)) %>%
-  # But the two exogenous variables have 2 lags c(0,1)
-  step_epi_lag(med_income_2y_prop, lag = c(0, 1)) %>%
-  step_epi_lag(num_graduates_prop, lag = c(0, 1)) %>%
-  step_epi_naomit()
-
-bake_and_show_sample(rx, employ_small)
-```
-
-## Model fitting & postprocessing
-
-Before fitting our model and making predictions, let's add some post-processing
-steps using a few [`frosting`](
-  https://cmu-delphi.github.io/epipredict/reference/frosting.html) layers to do 
-a few things:
-
-1. Convert our predictions back to income values and number of graduates, 
-rather than standardized proportions. We do this via the frosting layer 
-`layer_population_scaling`. 
-2. Threshold our predictions to 0. We are predicting proportions, which can't 
-be negative. And the transformed values back to dollars and people can't be 
-negative either.
-3. Generate prediction intervals based on residual quantiles, allowing us to 
-quantify the uncertainty associated with future predicted values.
-
-```{r custom-arx-post, include=T}
-# Create dataframe of the sums we used for standardizing
-# Only have to include med_income_5y since that is our outcome
-totals <- employ_small %>%
-  group_by(geo_value, age_group, edu_qual) %>%
-  summarise(med_income_5y_tot = sum(med_income_5y), .groups = "drop")
-
-# Define post-processing steps
-f <- frosting() %>%
-  layer_predict() %>%
-  layer_naomit(.pred) %>%
-  layer_threshold(.pred, lower = 0) %>%
-  # 90% prediction interval
-  layer_residual_quantiles(
-    quantile_levels = c(0.1, 0.9), 
-    symmetrize = FALSE
-  ) %>%
-  layer_population_scaling(
-    .pred, .pred_distn,
-    df = totals, df_pop_col = "med_income_5y_tot"
-  )
-
-wfx_linreg <- epi_workflow(rx, parsnip::linear_reg()) %>%
-  fit(employ_small) %>%
-  add_frosting(f)
-
-summary(extract_fit_engine(wfx_linreg))
-```
-
-Based on the summary output for this model, only the intercept term, 5-year 
-median income from 3 years ago, and the 2-year median income from 1 year ago 
-are significant linear predictors for today's 5-year median income at a 95% 
-confidence level. Both lags for the number of graduates were insignificant.
-
-Let's take a look at the predictions along with their 90% prediction intervals.
-
-```{r}
-latest <- get_test_data(recipe = rx, x = employ_small)
-predsx <- predict(wfx_linreg, latest)
-
-# Display values within prediction intervals
-predsx %>%
-  select(
-    geo_value, time_value, edu_qual, age_group, fos,
-    .pred_scaled, .pred_distn_scaled
-  ) %>%
-  head() %>%
-  pivot_quantiles_wider(.pred_distn_scaled)
-```
-
-# Using canned forecasters
-
-We've seen what we can do with non-epidemiological panel data using the 
-recipes frame, with `epi_recipe` for preprocessing, `epi_workflow` for model 
-fitting, and `frosting` for postprocessing. 
-
-`epipredict` also comes with canned forecasters that do all of those steps 
-behind the scenes for some simple models. Even though we aren't working with 
-epidemiological data, canned forecasters still work as expected, out of the box.
-We will demonstrate this with the simple
-[`flatline_forecaster`](
-  https://cmu-delphi.github.io/epipredict/reference/flatline_forecaster.html) 
-and the direct autoregressive (AR) forecaster 
-[`arx_forecaster`](
-  https://cmu-delphi.github.io/epipredict/reference/arx_forecaster.html).
-
-For both illustrations, we will continue to use the `employ_small` dataset 
-with the transformed numeric columns that are proportions within each group 
-by the keys in our `epi_df`.
-
-## Flatline forecaster 
-
-In this first example, we'll use `flatline_forecaster` to make a simple
-prediction of the 2-year median income for the next year, based on one previous 
-time point. This model is representated algebraically as:
-\[y_{t+1} = \alpha_0 + \alpha_0 y_{t}\]
-where $y_i$ is the 2-year median income (proportion) at time $i$.
-
-```{r flatline, include=T, warning=F}
-out_fl <- flatline_forecaster(employ_small, "med_income_2y_prop",
-  args_list = flatline_args_list(ahead = 1)
-)
-
-out_fl
-```
-
-## Autoregressive forecaster with exogenous inputs
-
-In this second example, we'll use `arx_forecaster` to make a prediction of the 
-5-year median income based using two lags, _and_ using two lags on two exogenous
-variables: 2-year median income and number of graduates. 
-
-The canned forecaster gives us a simple way of making this forecast since it 
-defines the recipe, workflow, and post-processing steps behind the scenes. This 
-is very similar to the model we introduced in the "Autoregressive Linear Model 
-with Exogenous Inputs" section of this article, but where all inputs have the 
-same number of lags.
-
-```{r arx-lr, include=T, warning=F}
-arx_args <- arx_args_list(lags = c(0L, 1L), ahead = 1L)
-
-out_arx_lr <- arx_forecaster(employ_small, "med_income_5y_prop",
-  c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
-  args_list = arx_args
-)
-
-out_arx_lr
-```
-
-Other changes to the direct AR forecaster, like changing the engine, also work 
-as expected. Below we use a boosted tree model instead of a linear regression.
-
-```{r arx-rf, include=T, warning=F}
-out_arx_rf <- arx_forecaster(
-  employ_small, "med_income_5y_prop",
-  c("med_income_5y_prop", "med_income_2y_prop", "num_graduates_prop"),
-  trainer = parsnip::boost_tree(mode = "regression", trees = 20),
-  args_list = arx_args
-)
-
-out_arx_rf
-```

From e036b8d5d6713b9522607c96ce62d05a276ae83a Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Mon, 18 Mar 2024 11:44:39 -0700
Subject: [PATCH 16/25] abort message typo

---
 R/layer_add_forecast_date.R | 2 +-
 R/layer_add_target_date.R   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R
index ea3635d79..31196f51c 100644
--- a/R/layer_add_forecast_date.R
+++ b/R/layer_add_forecast_date.R
@@ -104,7 +104,7 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da
     cli::cli_abort(c(
       "The `forecast_date` was given as a {.val {check$x}} while the",
       `!` = "`time_type` of the training data was {.val {check$expected}}.",
-      i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
+      i = "See {.topic epiprocess::epi_df} for how these are determined."
     ))
   }
 
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index 8ab077efa..6a9affc71 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -110,7 +110,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
       cli::cli_abort(c(
         "The `forecast_date` was given as a {.val {check$x}} while the",
         `!` = "`time_type` of the training data was {.val {check$expected}}.",
-        i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
+        i = "See {.topic epiprocess::epi_df} for how these are determined."
       ))
     }
     forecast_date <- coerce_time_type(possible_fd, expected_time_type)

From e6e60cd7b5305bcc3af65e0dc6501234cb1225b1 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Mon, 18 Mar 2024 11:49:36 -0700
Subject: [PATCH 17/25] bump version, add to news

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 68a566cee..9329382fd 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epipredict
 Title: Basic epidemiology forecasting methods
-Version: 0.0.10
+Version: 0.0.11
 Authors@R: c(
     person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")),
     person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"),
diff --git a/NEWS.md b/NEWS.md
index 04dc78e4f..5f629f45f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -33,3 +33,5 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
 -   Working vignette
 -   use `checkmate` for input validation
 -   refactor quantile extrapolation (possibly creates different results)
+-   force `target_date` + `forecast_date` handling to match the time_type of 
+    the epi_df. allows for annual and weekly data

From 845c1a9ecd7162e727c8c9cf4ee11754afaba345 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <dajmcdon@gmail.com>
Date: Mon, 18 Mar 2024 12:02:17 -0700
Subject: [PATCH 18/25] run styler

---
 R/epi_workflow.R                              |  2 +-
 R/layer_add_forecast_date.R                   |  2 +-
 R/layer_add_target_date.R                     | 23 ++++-----
 R/step_epi_shift.R                            |  5 +-
 R/time_types.R                                | 50 +++++++++++++------
 tests/testthat/test-layer_add_forecast_date.R |  1 -
 tests/testthat/test-layer_add_target_date.R   |  2 -
 7 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/R/epi_workflow.R b/R/epi_workflow.R
index 0fca5dc6c..e64e0f7bc 100644
--- a/R/epi_workflow.R
+++ b/R/epi_workflow.R
@@ -253,7 +253,7 @@ predict.epi_workflow <- function(object, new_data, ...) {
   if (!workflows::is_trained_workflow(object)) {
     cli::cli_abort(c(
       "Can't predict on an untrained epi_workflow.",
-        i = "Do you need to call `fit()`?"
+      i = "Do you need to call `fit()`?"
     ))
   }
   components <- list()
diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R
index 31196f51c..c02fc9173 100644
--- a/R/layer_add_forecast_date.R
+++ b/R/layer_add_forecast_date.R
@@ -97,7 +97,7 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da
   expected_time_type <- attr(
     workflows::extract_preprocessor(workflow)$template, "metadata"
   )$time_type
-  if (expected_time_type == "week") expected_time_type = "day"
+  if (expected_time_type == "week") expected_time_type <- "day"
   check <- validate_date(object$forecast_date, expected_time_type)
 
   if (!check$ok) {
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index 6a9affc71..9034398ff 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -87,7 +87,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
   expected_time_type <- attr(
     workflows::extract_preprocessor(workflow)$template, "metadata"
   )$time_type
-  if (expected_time_type == "week") expected_time_type = "day"
+  if (expected_time_type == "week") expected_time_type <- "day"
 
   if (!is.null(object$target_date)) {
     check <- validate_date(object$target_date, expected_time_type)
@@ -101,10 +101,9 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
     target_date <- coerce_time_type(object$target_date, expected_time_type)
   } else if (
     detect_layer(the_frosting, "layer_add_forecast_date") &&
-    !is.null(possible_fd <- extract_argument(
-      the_frosting, "layer_add_forecast_date", "forecast_date"
-    ))) {
-
+      !is.null(possible_fd <- extract_argument(
+        the_frosting, "layer_add_forecast_date", "forecast_date"
+      ))) {
     check <- validate_date(possible_fd, expected_time_type)
     if (!check$ok) {
       cli::cli_abort(c(
@@ -117,13 +116,13 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
     ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
     target_date <- forecast_date + ahead
   } else {
-      max_time_value <- max(
-        workflows::extract_preprocessor(workflow)$max_time_value,
-        workflow$fit$meta$max_time_value,
-        max(new_data$time_value)
-      )
-      ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
-      target_date <- max_time_value + ahead
+    max_time_value <- max(
+      workflows::extract_preprocessor(workflow)$max_time_value,
+      workflow$fit$meta$max_time_value,
+      max(new_data$time_value)
+    )
+    ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
+    target_date <- max_time_value + ahead
   }
 
   components$predictions <- dplyr::bind_cols(components$predictions,
diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R
index 94ef7178c..52f51de16 100644
--- a/R/step_epi_shift.R
+++ b/R/step_epi_shift.R
@@ -120,9 +120,8 @@ step_epi_ahead <-
     if (missing(ahead)) {
       cli::cli_abort(c(
         "The `ahead` argument must not be empty.",
-          i = "Did you perhaps pass an integer in `...` accidentally?"
-        )
-      )
+        i = "Did you perhaps pass an integer in `...` accidentally?"
+      ))
     }
     arg_is_nonneg_int(ahead)
     arg_is_chr_scalar(prefix, id)
diff --git a/R/time_types.R b/R/time_types.R
index 5f3aa4c14..0d1fa0662 100644
--- a/R/time_types.R
+++ b/R/time_types.R
@@ -3,23 +3,39 @@ guess_time_type <- function(time_value) {
   arg_is_scalar(time_value)
   if (is.character(time_value)) {
     if (nchar(time_value) <= "10") {
-      new_time_value <- tryCatch({
-        as.Date(time_value)
-      }, error = function(e) NULL)
+      new_time_value <- tryCatch(
+        {
+          as.Date(time_value)
+        },
+        error = function(e) NULL
+      )
     } else {
-      new_time_value <- tryCatch({
-        as.POSIXct(time_value)
-      }, error = function(e) NULL)
+      new_time_value <- tryCatch(
+        {
+          as.POSIXct(time_value)
+        },
+        error = function(e) NULL
+      )
     }
     if (!is.null(new_time_value)) time_value <- new_time_value
   }
-  if (inherits(time_value, "POSIXct")) return("day-time")
-  if (inherits(time_value, "Date")) return("day")
-  if (inherits(time_value, "yearweek")) return("yearweek")
-  if (inherits(time_value, "yearmonth")) return("yearmonth")
-  if (inherits(time_value, "yearquarter")) return("yearquarter")
+  if (inherits(time_value, "POSIXct")) {
+    return("day-time")
+  }
+  if (inherits(time_value, "Date")) {
+    return("day")
+  }
+  if (inherits(time_value, "yearweek")) {
+    return("yearweek")
+  }
+  if (inherits(time_value, "yearmonth")) {
+    return("yearmonth")
+  }
+  if (inherits(time_value, "yearquarter")) {
+    return("yearquarter")
+  }
   if (is.numeric(time_value) && all(time_value == as.integer(time_value)) &&
-      all(time_value >= 1582)) {
+    all(time_value >= 1582)) {
     return("year")
   }
   return("custom")
@@ -27,11 +43,13 @@ guess_time_type <- function(time_value) {
 
 coerce_time_type <- function(x, target_type) {
   if (target_type == "year") {
-    if (is.numeric(x)) return(as.integer(x))
-    else return(as.POSIXlt(x)$year + 1900L)
+    if (is.numeric(x)) {
+      return(as.integer(x))
+    } else {
+      return(as.POSIXlt(x)$year + 1900L)
+    }
   }
-  switch(
-    target_type,
+  switch(target_type,
     "day-time" = as.POSIXct(x),
     "day" = as.Date(x),
     "week" = as.Date(x),
diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 93f13baf7..698924af3 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -104,5 +104,4 @@ test_that("forecast date works for daily", {
     adjust_frosting(f, "layer_add_forecast_date", forecast_date = 2022L)
   )
   expect_error(predict(wf1, latest)) # wrong time type of forecast_date
-
 })
diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index f3b706aa6..6316ec991 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -115,6 +115,4 @@ test_that("forecast date works for daily", {
     adjust_frosting(f, "layer_add_target_date", target_date = 2022L)
   )
   expect_error(predict(wf1, latest)) # wrong time type of forecast_date
-
 })
-

From 8feef06ded15745a3d0ae0be73841eaf8acc3eee Mon Sep 17 00:00:00 2001
From: Daniel McDonald <dajmcdon@gmail.com>
Date: Wed, 27 Mar 2024 11:12:23 -0700
Subject: [PATCH 19/25] Apply @dsweber2 suggestions from code review

Co-authored-by: David Weber <david.weber2@pm.me>
---
 tests/testthat/test-layer_add_forecast_date.R | 16 ++++++++++------
 tests/testthat/test-layer_add_target_date.R   |  2 +-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 698924af3..809644b73 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -84,24 +84,28 @@ test_that("forecast date works for daily", {
 
   wf1 <- add_frosting(wf, f)
   p <- predict(wf1, latest)
+  # both forecast_date and epi_df are dates
   expect_identical(p$forecast_date[1], as.Date("2021-12-31"))
 
-  latest_bad <- latest %>%
+  # forecast_date is a date while the epi_df uses years (ints)
+  latest_yearly <- latest %>%
     unclass() %>%
     as.data.frame() %>%
     mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>%
     as_epi_df()
+  expect_error(predict(wf1, latest_yearly))
 
-  expect_error(predict(wf1, latest_bad))
-  wf1 <- add_frosting(
+  # forecast_date is a string
+  wf2 <- add_frosting(
     wf,
     adjust_frosting(f, "layer_add_forecast_date", forecast_date = "2022-01-01")
   )
-  expect_silent(predict(wf1, latest))
+  expect_silent(predict(wf2, latest))
 
-  wf1 <- add_frosting(
+  # forecast_date is a year/int while the epi_df is a date
+  wf3 <- add_frosting(
     wf,
     adjust_frosting(f, "layer_add_forecast_date", forecast_date = 2022L)
   )
-  expect_error(predict(wf1, latest)) # wrong time type of forecast_date
+  expect_error(predict(wf3, latest))
 })
diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index 6316ec991..999c07367 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -87,7 +87,7 @@ test_that("Specify own target date", {
   expect_named(p2, c("geo_value", "time_value", ".pred", "target_date"))
 })
 
-test_that("forecast date works for daily", {
+test_that("forecast date works for daily and yearly", {
   f <- frosting() %>%
     layer_predict() %>%
     layer_add_target_date() %>%

From 6d5c3791956db72c3458c5c06576a97eddc90495 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Wed, 27 Mar 2024 11:13:28 -0700
Subject: [PATCH 20/25] add back omitted test, include some comments

---
 tests/testthat/test-layer_add_forecast_date.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 698924af3..d452990e9 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -11,9 +11,9 @@ latest <- jhu %>%
 
 test_that("layer validation works", {
   f <- frosting()
-
-
-  expect_error(layer_add_forecast_date(f, "2022-05-31", id = 2))
+  expect_error(layer_add_forecast_date(f, c("2022-05-31", "2022-05-31"))) # multiple forecast_dates
+  expect_error(layer_add_forecast_date(f, "2022-05-31", id = 2)) # id is not a character
+  expect_error(layer_add_forecast_date(f, "2022-05-31", id = c("a", "b"))) # multiple ids
   expect_silent(layer_add_forecast_date(f, "2022-05-31"))
   expect_silent(layer_add_forecast_date(f))
   expect_silent(layer_add_forecast_date(f, as.Date("2022-05-31")))

From 047caa685e8a1038bc93afa178ff323207dd48ef Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Wed, 27 Mar 2024 11:13:45 -0700
Subject: [PATCH 21/25] validate scalar forecast_date

---
 R/layer_add_forecast_date.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R
index c02fc9173..376194712 100644
--- a/R/layer_add_forecast_date.R
+++ b/R/layer_add_forecast_date.R
@@ -79,6 +79,7 @@ layer_add_forecast_date <-
 
 layer_add_forecast_date_new <- function(forecast_date, id) {
   arg_is_chr_scalar(id)
+  arg_is_scalar(forecast_date, allow_null = TRUE)
   # can't validate forecast_date until we know the time_type
   layer("add_forecast_date", forecast_date = forecast_date, id = id)
 }

From fc05cd93d9a32a6060ed02a301188364c453e165 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Wed, 27 Mar 2024 11:15:34 -0700
Subject: [PATCH 22/25] symmetrize scalar validation

---
 R/layer_add_forecast_date.R | 7 ++++---
 R/layer_add_target_date.R   | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R
index 376194712..18d57337f 100644
--- a/R/layer_add_forecast_date.R
+++ b/R/layer_add_forecast_date.R
@@ -68,6 +68,9 @@
 #' p3
 layer_add_forecast_date <-
   function(frosting, forecast_date = NULL, id = rand_id("add_forecast_date")) {
+    arg_is_chr_scalar(id)
+    arg_is_scalar(forecast_date, allow_null = TRUE)
+    # can't validate the type of forecast_date until we know the time_type
     add_layer(
       frosting,
       layer_add_forecast_date_new(
@@ -78,9 +81,7 @@ layer_add_forecast_date <-
   }
 
 layer_add_forecast_date_new <- function(forecast_date, id) {
-  arg_is_chr_scalar(id)
-  arg_is_scalar(forecast_date, allow_null = TRUE)
-  # can't validate forecast_date until we know the time_type
+
   layer("add_forecast_date", forecast_date = forecast_date, id = id)
 }
 
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index 9034398ff..217a06069 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -63,8 +63,9 @@
 #' p3
 layer_add_target_date <-
   function(frosting, target_date = NULL, id = rand_id("add_target_date")) {
-    # can't validate target_date until we know the time_type
     arg_is_chr_scalar(id)
+    arg_is_scalar(target_date, allow_null = TRUE)
+    # can't validate the type of target_date until we know the time_type
     add_layer(
       frosting,
       layer_add_target_date_new(

From 1e76059401b996234fad022115b8f433967ff352 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Wed, 27 Mar 2024 11:18:47 -0700
Subject: [PATCH 23/25] add uncommitted stuff from code review

---
 tests/testthat/test-layer_add_target_date.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index 999c07367..49049da7e 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -87,7 +87,7 @@ test_that("Specify own target date", {
   expect_named(p2, c("geo_value", "time_value", ".pred", "target_date"))
 })
 
-test_that("forecast date works for daily and yearly", {
+test_that("target date works for daily and yearly", {
   f <- frosting() %>%
     layer_predict() %>%
     layer_add_target_date() %>%
@@ -95,21 +95,25 @@ test_that("forecast date works for daily and yearly", {
 
   wf1 <- add_frosting(wf, f)
   p <- predict(wf1, latest)
+  # both target_date and epi_df are dates
   expect_identical(p$target_date[1], as.Date("2021-12-31") + 7L)
 
+  # target_date is a date while the epi_df uses years (ints)
   latest_bad <- latest %>%
     unclass() %>%
     as.data.frame() %>%
     mutate(time_value = as.POSIXlt(time_value)$year + 1900L) %>%
     as_epi_df()
-
   expect_error(predict(wf1, latest_bad))
+
+  # target_date is a string (gets correctly converted to Date)
   wf1 <- add_frosting(
     wf,
     adjust_frosting(f, "layer_add_target_date", target_date = "2022-01-07")
   )
   expect_silent(predict(wf1, latest))
 
+  # target_date is a year/int while the epi_df is a date
   wf1 <- add_frosting(
     wf,
     adjust_frosting(f, "layer_add_target_date", target_date = 2022L)

From f5ef88ba2f08c6851971c295205180d0ca20c3ed Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Wed, 27 Mar 2024 13:37:49 -0700
Subject: [PATCH 24/25] clarify some test purposes

---
 tests/testthat/test-layer_add_forecast_date.R | 5 +++--
 tests/testthat/test-layer_add_target_date.R   | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R
index 1e04a3c11..9595b47b6 100644
--- a/tests/testthat/test-layer_add_forecast_date.R
+++ b/tests/testthat/test-layer_add_forecast_date.R
@@ -87,7 +87,8 @@ test_that("forecast date works for daily", {
   # both forecast_date and epi_df are dates
   expect_identical(p$forecast_date[1], as.Date("2021-12-31"))
 
-  # forecast_date is a date while the epi_df uses years (ints)
+  # the error happens at predict time because the
+  # time_value train/test types don't match
   latest_yearly <- latest %>%
     unclass() %>%
     as.data.frame() %>%
@@ -95,7 +96,7 @@ test_that("forecast date works for daily", {
     as_epi_df()
   expect_error(predict(wf1, latest_yearly))
 
-  # forecast_date is a string
+  # forecast_date is a string, gets correctly converted to date
   wf2 <- add_frosting(
     wf,
     adjust_frosting(f, "layer_add_forecast_date", forecast_date = "2022-01-01")
diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R
index 49049da7e..e5349839b 100644
--- a/tests/testthat/test-layer_add_target_date.R
+++ b/tests/testthat/test-layer_add_target_date.R
@@ -98,7 +98,8 @@ test_that("target date works for daily and yearly", {
   # both target_date and epi_df are dates
   expect_identical(p$target_date[1], as.Date("2021-12-31") + 7L)
 
-  # target_date is a date while the epi_df uses years (ints)
+  # the error happens at predict time because the
+  # time_value train/test types don't match
   latest_bad <- latest %>%
     unclass() %>%
     as.data.frame() %>%

From c1e3ff9c2a70bc018fe1232f6845270512aa8815 Mon Sep 17 00:00:00 2001
From: "Daniel J. McDonald" <daniel@stat.ubc.ca>
Date: Wed, 27 Mar 2024 13:38:21 -0700
Subject: [PATCH 25/25] refactor validate_date() to avoid duplication

---
 R/layer_add_forecast_date.R | 23 ++++++++++-------------
 R/layer_add_target_date.R   | 30 +++++++++++-------------------
 R/time_types.R              | 15 +++++++++++----
 3 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R
index 18d57337f..5bd6b6918 100644
--- a/R/layer_add_forecast_date.R
+++ b/R/layer_add_forecast_date.R
@@ -81,7 +81,6 @@ layer_add_forecast_date <-
   }
 
 layer_add_forecast_date_new <- function(forecast_date, id) {
-
   layer("add_forecast_date", forecast_date = forecast_date, id = id)
 }
 
@@ -93,27 +92,25 @@ slather.layer_add_forecast_date <- function(object, components, workflow, new_da
       workflow$fit$meta$max_time_value,
       max(new_data$time_value)
     )
-    object$forecast_date <- max_time_value
+    forecast_date <- max_time_value
+  } else {
+    forecast_date <- object$forecast_date
   }
 
   expected_time_type <- attr(
     workflows::extract_preprocessor(workflow)$template, "metadata"
   )$time_type
   if (expected_time_type == "week") expected_time_type <- "day"
-  check <- validate_date(object$forecast_date, expected_time_type)
-
-  if (!check$ok) {
-    cli::cli_abort(c(
-      "The `forecast_date` was given as a {.val {check$x}} while the",
-      `!` = "`time_type` of the training data was {.val {check$expected}}.",
-      i = "See {.topic epiprocess::epi_df} for how these are determined."
-    ))
-  }
-
+  validate_date(forecast_date, expected_time_type,
+    call = expr(layer_add_forecast_date())
+  )
+  forecast_date <- coerce_time_type(forecast_date, expected_time_type)
+  object$forecast_date <- forecast_date
   components$predictions <- dplyr::bind_cols(
     components$predictions,
-    forecast_date = coerce_time_type(object$forecast_date, expected_time_type)
+    forecast_date = forecast_date
   )
+
   components
 }
 
diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R
index 217a06069..a50b6042c 100644
--- a/R/layer_add_target_date.R
+++ b/R/layer_add_target_date.R
@@ -91,29 +91,20 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
   if (expected_time_type == "week") expected_time_type <- "day"
 
   if (!is.null(object$target_date)) {
-    check <- validate_date(object$target_date, expected_time_type)
-    if (!check$ok) {
-      cli::cli_abort(c(
-        "The `target_date` was given as a {.val {check$x}} while the",
-        `!` = "`time_type` of the training data was {.val {check$expected}}.",
-        i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
-      ))
-    }
-    target_date <- coerce_time_type(object$target_date, expected_time_type)
+    target_date <- object$target_date
+    validate_date(target_date, expected_time_type,
+      call = expr(layer_add_target_date())
+    )
+    target_date <- coerce_time_type(target_date, expected_time_type)
   } else if (
     detect_layer(the_frosting, "layer_add_forecast_date") &&
-      !is.null(possible_fd <- extract_argument(
+      !is.null(forecast_date <- extract_argument(
         the_frosting, "layer_add_forecast_date", "forecast_date"
       ))) {
-    check <- validate_date(possible_fd, expected_time_type)
-    if (!check$ok) {
-      cli::cli_abort(c(
-        "The `forecast_date` was given as a {.val {check$x}} while the",
-        `!` = "`time_type` of the training data was {.val {check$expected}}.",
-        i = "See {.topic epiprocess::epi_df} for how these are determined."
-      ))
-    }
-    forecast_date <- coerce_time_type(possible_fd, expected_time_type)
+    validate_date(forecast_date, expected_time_type,
+      call = expr(layer_add_forecast_date())
+    )
+    forecast_date <- coerce_time_type(forecast_date, expected_time_type)
     ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead")
     target_date <- forecast_date + ahead
   } else {
@@ -126,6 +117,7 @@ slather.layer_add_target_date <- function(object, components, workflow, new_data
     target_date <- max_time_value + ahead
   }
 
+  object$target_date <- target_date
   components$predictions <- dplyr::bind_cols(components$predictions,
     target_date = target_date
   )
diff --git a/R/time_types.R b/R/time_types.R
index 0d1fa0662..7fe3e47b4 100644
--- a/R/time_types.R
+++ b/R/time_types.R
@@ -59,8 +59,15 @@ coerce_time_type <- function(x, target_type) {
   )
 }
 
-validate_date <- function(x, expected) {
-  x <- guess_time_type(x)
-  ok <- x == expected
-  enlist(ok, x, expected)
+validate_date <- function(x, expected, arg = rlang::caller_arg(x),
+                          call = rlang::caller_env()) {
+  time_type_x <- guess_time_type(x)
+  ok <- time_type_x == expected
+  if (!ok) {
+    cli::cli_abort(c(
+      "The {.arg {arg}} was given as a {.val {time_type_x}} while the",
+      `!` = "`time_type` of the training data was {.val {expected}}.",
+      i = "See {.topic epiprocess::epi_df} for descriptions of these are determined."
+    ), call = call)
+  }
 }