From 01f22621f16ed593faf733175878feea73f9d5f3 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 21 Mar 2024 15:45:02 -0700 Subject: [PATCH 01/18] wip+doc: add S3 implementation of epi_archive * remove comment #417 * bump version to 0.7.6 and add NEWS line --- DESCRIPTION | 3 + NAMESPACE | 27 + NEWS.md | 1 + R/archive.R | 3 - R/archive_new.R | 1115 +++++++++++++++++ R/data.R | 8 + R/grouped_archive_new.R | 456 +++++++ R/methods-epi_archive.R | 3 +- R/methods-epi_archive_new.R | 826 ++++++++++++ man/as_epi_archive2.Rd | 142 +++ man/as_of.epi_archive2.Rd | 33 + man/epi_archive.Rd | 106 +- man/epix_as_of2.Rd | 95 ++ man/epix_fill_through_version2.Rd | 48 + man/epix_merge2.Rd | 71 ++ man/epix_slide2.Rd | 283 +++++ man/epix_truncate_versions_after.Rd | 10 +- ...ate_versions_after.grouped_epi_archive2.Rd | 11 + man/fill_through_version.epi_archive2.Rd | 21 + man/group_by.epi_archive.Rd | 23 +- man/group_by.epi_archive2.Rd | 147 +++ man/is_epi_archive2.Rd | 35 + man/max_version_with_row_in.Rd | 9 +- man/merge_epi_archive2.Rd | 30 + man/new_epi_archive2.Rd | 69 + man/next_after.Rd | 8 +- man/print.epi_archive2.Rd | 17 + man/slide.epi_archive2.Rd | 101 ++ man/slide.grouped_epi_archive2.Rd | 24 + man/truncate_versions_after.epi_archive2.Rd | 19 + ...ate_versions_after.grouped_epi_archive2.Rd | 18 + tests/testthat/test-archive_new.R | 173 +++ tests/testthat/test-compactify.R | 2 +- tests/testthat/test-compactify_new.R | 110 ++ .../test-epix_fill_through_version_new.R | 109 ++ tests/testthat/test-epix_merge_new.R | 226 ++++ tests/testthat/test-epix_slide_new.R | 810 ++++++++++++ tests/testthat/test-grouped_epi_archive_new.R | 104 ++ tests/testthat/test-methods-epi_archive_new.R | 136 ++ vignettes/advanced.Rmd | 27 +- vignettes/archive.Rmd | 24 +- vignettes/compactify.Rmd | 10 +- 42 files changed, 5452 insertions(+), 41 deletions(-) create mode 100644 R/archive_new.R create mode 100644 R/grouped_archive_new.R create mode 100644 R/methods-epi_archive_new.R create mode 100644 man/as_epi_archive2.Rd create mode 100644 man/as_of.epi_archive2.Rd create mode 100644 man/epix_as_of2.Rd create mode 100644 man/epix_fill_through_version2.Rd create mode 100644 man/epix_merge2.Rd create mode 100644 man/epix_slide2.Rd create mode 100644 man/epix_truncate_versions_after.grouped_epi_archive2.Rd create mode 100644 man/fill_through_version.epi_archive2.Rd create mode 100644 man/group_by.epi_archive2.Rd create mode 100644 man/is_epi_archive2.Rd create mode 100644 man/merge_epi_archive2.Rd create mode 100644 man/new_epi_archive2.Rd create mode 100644 man/print.epi_archive2.Rd create mode 100644 man/slide.epi_archive2.Rd create mode 100644 man/slide.grouped_epi_archive2.Rd create mode 100644 man/truncate_versions_after.epi_archive2.Rd create mode 100644 man/truncate_versions_after.grouped_epi_archive2.Rd create mode 100644 tests/testthat/test-archive_new.R create mode 100644 tests/testthat/test-compactify_new.R create mode 100644 tests/testthat/test-epix_fill_through_version_new.R create mode 100644 tests/testthat/test-epix_merge_new.R create mode 100644 tests/testthat/test-epix_slide_new.R create mode 100644 tests/testthat/test-grouped_epi_archive_new.R create mode 100644 tests/testthat/test-methods-epi_archive_new.R diff --git a/DESCRIPTION b/DESCRIPTION index 3c6bbb16..cfdd9f49 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -73,6 +73,7 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' + 'archive_new.R' 'autoplot.R' 'correlation.R' 'data.R' @@ -80,9 +81,11 @@ Collate: 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' + 'grouped_archive_new.R' 'grouped_epi_archive.R' 'growth_rate.R' 'key_colnames.R' + 'methods-epi_archive_new.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index 18bc6fc6..d5d1cd7b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,40 +6,58 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) +S3method(as_of,epi_archive2) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) +S3method(clone,epi_archive2) +S3method(clone,grouped_epi_archive2) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) S3method(epix_truncate_versions_after,epi_archive) +S3method(epix_truncate_versions_after,epi_archive2) S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(epix_truncate_versions_after,grouped_epi_archive2) S3method(group_by,epi_archive) +S3method(group_by,epi_archive2) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) +S3method(group_by,grouped_epi_archive2) S3method(group_by_drop_default,grouped_epi_archive) +S3method(group_by_drop_default,grouped_epi_archive2) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) +S3method(groups,grouped_epi_archive2) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) +S3method(print,epi_archive2) S3method(print,epi_df) +S3method(print,grouped_epi_archive2) S3method(select,epi_df) +S3method(slide,grouped_epi_archive2) S3method(summary,epi_df) +S3method(truncate_versions_after,grouped_epi_archive2) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) +S3method(ungroup,grouped_epi_archive2) S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) +export(archive_cases_dv_subset_2) export(arrange) export(as_epi_archive) +export(as_epi_archive2) export(as_epi_df) +export(as_of) export(as_tsibble) export(autoplot) +export(clone) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) @@ -50,24 +68,33 @@ export(epi_slide_mean) export(epi_slide_opt) export(epi_slide_sum) export(epix_as_of) +export(epix_as_of2) export(epix_merge) +export(epix_merge2) export(epix_slide) +export(epix_slide2) export(epix_truncate_versions_after) +export(fill_through_version) export(filter) export(group_by) export(group_modify) export(growth_rate) export(is_epi_archive) +export(is_epi_archive2) export(is_epi_df) export(is_grouped_epi_archive) +export(is_grouped_epi_archive2) export(key_colnames) export(max_version_with_row_in) export(mutate) +export(new_epi_archive2) export(new_epi_df) export(next_after) export(relocate) export(rename) export(slice) +export(slide) +export(truncate_versions_after) export(ungroup) export(unnest) importFrom(R6,R6Class) diff --git a/NEWS.md b/NEWS.md index b9b2f60b..e2c5b8e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Breaking changes - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 +- Refactor `epi_archive` to use S3 instead of R6 for its object model. The calls to some functions will change, but the functionality will remain the same. It will also help us maintain the package better in the future. (#340) # epiprocess 0.7.0 diff --git a/R/archive.R b/R/archive.R index ff3bc20c..a530cc05 100644 --- a/R/archive.R +++ b/R/archive.R @@ -514,9 +514,6 @@ epi_archive <- fromLast = TRUE ) %>% tibble::as_tibble() %>% - # (`as_tibble` should de-alias the DT and its columns in any edge - # cases where they are aliased. We don't say we guarantee this - # though.) dplyr::select(-"version") %>% as_epi_df( geo_type = self$geo_type, diff --git a/R/archive_new.R b/R/archive_new.R new file mode 100644 index 00000000..0b4f3695 --- /dev/null +++ b/R/archive_new.R @@ -0,0 +1,1115 @@ +# We use special features of data.table's `[`. The data.table package has a +# compatibility feature that disables some/all of these features if it thinks we +# might expect `data.frame`-compatible behavior instead. We can signal that we +# want the special behavior via `.datatable.aware = TRUE` or by importing any +# `data.table` package member. Do both to prevent surprises if we decide to use +# `data.table::` everywhere and not importing things. +.datatable.aware <- TRUE + +#' Validate a version bound arg +#' +#' Expected to be used on `clobberable_versions_start`, `versions_end`, +#' and similar arguments. Some additional context-specific checks may be needed. +#' +#' @param version_bound the version bound to validate +#' @param x a data frame containing a version column with which to check +#' compatibility +#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will +#' have a special context-dependent meaning.) +#' @param version_bound_arg optional string; what to call the version bound in +#' error messages +#' +#' @section Side effects: raises an error if version bound appears invalid +#' +#' @noRd +validate_version_bound <- function(version_bound, x, na_ok = FALSE, + version_bound_arg = rlang::caller_arg(version_bound), + x_arg = rlang::caller_arg(version_bound)) { + if (is.null(version_bound)) { + cli_abort( + "{version_bound_arg} cannot be NULL" + ) + } + if (na_ok && is.na(version_bound)) { + return(invisible(NULL)) + } + if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same classes as x$version, + which is {class(x$version)}", + ) + } + if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same types as x$version, + which is {typeof(x$version)}", + ) + } + + return(invisible(NULL)) +} + +#' `max(x$version)`, with error if `x` has 0 rows +#' +#' Exported to make defaults more easily copyable. +#' +#' @param x `x` argument of [`as_epi_archive`] +#' +#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or +#' an `NA` version value +#' +#' @export +max_version_with_row_in <- function(x) { + if (nrow(x) == 0L) { + cli_abort( + "`nrow(x)==0L`, representing a data set history with no row up through the + latest observed version, but we don't have a sensible guess at what version + that is, or whether any of the empty versions might be clobbered in the + future; if we use `x` to form an `epi_archive`, then + `clobberable_versions_start` and `versions_end` must be manually specified.", + class = "epiprocess__max_version_cannot_be_used" + ) + } else { + version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist + if (anyNA(version_col)) { + cli_abort("version values cannot be NA", + class = "epiprocess__version_values_must_not_be_na" + ) + } else { + version_bound <- max(version_col) + } + } +} + +#' Get the next possible value greater than `x` of the same type +#' +#' @param x the starting "value"(s) +#' @return same class, typeof, and length as `x` +#' +#' @export +next_after <- function(x) UseMethod("next_after") + +#' @export +next_after.integer <- function(x) x + 1L + +#' @export +next_after.Date <- function(x) x + 1L + + + +#' epi archive +#' @title `epi_archive` object +#' +#' @description An `epi_archive` is an R6 class which contains a data table +#' along with several relevant pieces of metadata. The data table can be seen +#' as the full archive (version history) for some signal variables of +#' interest. +#' +#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of +#' class `data.table` from the `data.table` package, with (at least) the +#' following columns: +#' +#' * `geo_value`: the geographic value associated with each row of measurements. +#' * `time_value`: the time value associated with each row of measurements. +#' * `version`: the time value specifying the version for each row of +#' measurements. For example, if in a given row the `version` is January 15, +#' 2022 and `time_value` is January 14, 2022, then this row contains the +#' measurements of the data for January 14, 2022 that were available one day +#' later. +#' +#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, +#' as well as any others (these can be specified when instantiating the +#' `epi_archive` object via the `other_keys` argument, and/or set by operating +#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for +#' information and examples of relevant parameter names for an `epi_archive` object. +#' Note that there can only be a single row per unique combination of +#' key variables, and thus the key variables are critical for figuring out how +#' to generate a snapshot of data from the archive, as of a given version. +#' +#' In general, the last version of each observation is carried forward (LOCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `versions_end`. One consequence is that the `DT` +#' doesn't have to contain a full snapshot of every version (although this +#' generally works), but can instead contain only the rows that are new or +#' changed from the previous version (see `compactify`, which does this +#' automatically). Currently, deletions must be represented as revising a row +#' to a special state (e.g., making the entries `NA` or including a special +#' column that flags the data as removed and performing some kind of +#' post-processing), and the archive is unaware of what this state is. Note +#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, +#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' **A word of caution:** R6 objects, unlike most other objects in R, have +#' reference semantics. A primary consequence of this is that objects are not +#' copied when modified. You can read more about this in Hadley Wickham's +#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order +#' to construct a modified archive while keeping the original intact, first +#' make a clone using the `$clone` method, then overwrite the clone's `DT` +#' field with `data.table::copy(clone$DT)`, and finally perform the +#' modifications on the clone. +#' +#' @section Metadata: +#' The following pieces of metadata are included as fields in an `epi_archive` +#' object: +#' +#' * `geo_type`: the type for the geo values. +#' * `time_type`: the type for the time values. +#' * `additional_metadata`: list of additional metadata for the data archive. +#' +#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be +#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, +#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the +#' metadata of an `epi_archive` object are not currently used by any +#' downstream functions in the `epiprocess` package, and serve only as useful +#' bits of information to convey about the data set at hand. +#' +#' @section Generating Snapshots: +#' An `epi_archive` object can be used to generate a snapshot of the data in +#' `epi_df` format, which represents the most up-to-date values of the signal +#' variables, as of the specified version. This is accomplished by calling the +#' `as_of()` method for an `epi_archive` object `x`. More details on this +#' method are documented in the wrapper function [`epix_as_of()`]. +#' +#' @section Sliding Computations: +#' We can run a sliding computation over an `epi_archive` object, much like +#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling +#' the `slide()` method for an `epi_archive` object, which works similarly to +#' the way `epi_slide()` works for an `epi_df` object, but with one key +#' difference: it is version-aware. That is, for an `epi_archive` object, the +#' sliding computation at any given reference time point t is performed on +#' **data that would have been available as of t**. More details on `slide()` +#' are documented in the wrapper function [`epix_slide()`]. +#' +#' @export +#' @examples +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% new_epi_archive2( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' @name epi_archive +# TODO: Figure out where to actually put this documentation +NULL + +#' New epi archive +#' @description Creates a new `epi_archive` object. +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param other_keys Character vector specifying the names of variables in `x` +#' that should be considered key variables (in the language of `data.table`) +#' apart from "geo_value", "time_value", and "version". +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_archive` object. The metadata will have `geo_type` and `time_type` +#' fields; named entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space while maintaining the same behavior (with the help of the +#' `clobberable_versions_start` and `versions_end` fields in some edge cases). +#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will +#' remove these rows and issue a warning. Generally, this can be set to +#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` +#' such as its `DT`, or rely on redundant updates to achieve a certain +#' behavior of the `ref_time_values` default in `epix_slide`, you will have to +#' determine whether `compactify=TRUE` will produce the desired results. If +#' compactification here is removing a large proportion of the rows, this may +#' indicate a potential for space, time, or bandwidth savings upstream the +#' data pipeline, e.g., by avoiding fetching, storing, or processing these +#' rows of `x`. +#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] +#' @param versions_end Optional; as in [`as_epi_archive`] +#' @return An `epi_archive` object. +#' @importFrom data.table as.data.table key setkeyv +#' +#' @details +#' Refer to the documentation for [as_epi_archive()] for more information +#' and examples of parameter names. +#' @export +new_epi_archive2 <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + # If geo type is missing, then try to guess it + if (missing(geo_type) || is.null(geo_type)) { + geo_type <- guess_geo_type(x$geo_value) + } + + # If time type is missing, then try to guess it + if (missing(time_type) || is.null(time_type)) { + time_type <- guess_time_type(x$time_value) + } + + # Finish off with small checks on keys variables and metadata + if (missing(other_keys)) other_keys <- NULL + if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + if (missing(compactify)) { + compactify <- NULL + } + assert_logical(compactify, len = 1, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- NA + } + if (missing(versions_end) || is.null(versions_end)) { + versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + sprintf( + "`versions_end` was %s, but `x` contained + updates for a later version or versions, up through %s", + versions_end, max(x[["version"]]) + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + sprintf( + "`versions_end` was %s, but a `clobberable_versions_start` + of %s indicated that there were later observed versions", + versions_end, clobberable_versions_start + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } + + # --- End of validation and replacing missing args with defaults --- + + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars <- c("geo_value", "time_value", other_keys, "version") + DT <- as.data.table(x, key = key_vars) + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + cli_abort("`x` must have one row per unique combination of the key variables. If you + have additional key variables other than `geo_value`, `time_value`, and + `version`, such as an age group column, please specify them in `other_keys`. + Otherwise, check for duplicate rows and/or conflicting values for the same + measurement.", + class = "epiprocess__epi_archive_requires_unique_key" + ) + } + + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == dplyr::lag(vec), + is.na(vec) & is.na(dplyr::lag(vec)) + ) + } + + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim <- keep_locf(DT) + DT <- rm_locf(DT) + } else { + # Create empty data frame for nrow(elim) to be 0 + elim <- tibble::tibble() + } + + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) + warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + warning_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. + To silence this warning but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) + warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) + rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + } + + structure( + list( + DT = DT, + geo_type = geo_type, + time_type = time_type, + additional_metadata = additional_metadata, + clobberable_versions_start = clobberable_versions_start, + versions_end = versions_end, + private = list() # TODO: to be encapsulated with guard-rails later + ), + class = "epi_archive2" + ) +} + +#' Print information about an `epi_archive` object +#' @param class Boolean; whether to print the class label header +#' @param methods Boolean; whether to print all available methods of +#' the archive +#' @importFrom cli cli_inform +#' @export +print.epi_archive2 <- function(epi_archive, class = TRUE, methods = TRUE) { + cli_inform( + c( + ">" = if (class) "An `epi_archive` object, with metadata:", + "i" = if (length(setdiff(key(epi_archive$DT), c("geo_value", "time_value", "version"))) > 0) { + "Non-standard DT keys: {setdiff(key(epi_archive$DT), c('geo_value', 'time_value', 'version'))}" + }, + "i" = "Min/max time values: {min(epi_archive$DT$time_value)} / {max(epi_archive$DT$time_value)}", + "i" = "First/last version with update: {min(epi_archive$DT$version)} / {max(epi_archive$DT$version)}", + "i" = if (!is.na(epi_archive$clobberable_versions_start)) { + "Clobberable versions start: {epi_archive$clobberable_versions_start}" + }, + "i" = "Versions end: {epi_archive$versions_end}", + "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", + "i" = "A preview of the table ({nrow(epi_archive$DT)} rows x {ncol(epi_archive$DT)} columns):" + ) + ) + + return(invisible(epi_archive$DT %>% print())) +} + + +#' @export +as_of <- function(x, ...) { + UseMethod("as_of") +} + + +#' As of epi_archive +#' @description Generates a snapshot in `epi_df` format as of a given version. +#' See the documentation for the wrapper function [`epix_as_of()`] for +#' details. The parameter descriptions below are copied from there +#' @param epi_archive An `epi_archive` object +#' @param max_version Version specifying the max version to permit in the +#' snapshot. That is, the snapshot will comprise the unique rows of the +#' current archive data that represent the most up-to-date signal values, as +#' of the specified `max_version` (and whose `time_value`s are at least +#' `min_time_value`). +#' @param min_time_value Time value specifying the min `time_value` to permit in +#' the snapshot. Default is `-Inf`, which effectively means that there is no +#' minimum considered. +#' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in +#' `epi_archive` format, and contain rows in the specified `time_value` range +#' having `version <= max_version`. The resulting object will cover a +#' potentially narrower `version` and `time_value` range than `x`, depending +#' on user-provided arguments. Otherwise, there will be one row in the output +#' for the `max_version` of each `time_value`. Default is `FALSE`. +#' @importFrom data.table between key +#' @export +as_of.epi_archive2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { + other_keys <- setdiff( + key(epi_archive$DT), + c("geo_value", "time_value", "version") + ) + if (length(other_keys) == 0) other_keys <- NULL + + # Check a few things on max_version + if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { + cli_abort( + "`max_version` must have the same classes as `epi_archive$DT$version`." + ) + } + if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { + cli_abort( + "`max_version` must have the same types as `epi_archive$DT$version`." + ) + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > epi_archive$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + assert_logical(all_versions, len = 1) + if (!is.na(epi_archive$clobberable_versions_start) && max_version >= epi_archive$clobberable_versions_start) { + cli_warn( + 'Getting data as of some recent version which could still be + overwritten (under routine circumstances) without assigning a new + version number (a.k.a. "clobbered"). Thus, the snapshot that we + produce here should not be expected to be reproducible later. See + `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + class = "epiprocess__snapshot_as_of_clobberable_version" + ) + } + + # Filter by version and return + if (all_versions) { + # epi_archive is copied into result, so we can modify result directly + result <- epix_truncate_versions_after(epi_archive, max_version) + result$DT <- result$DT[time_value >= min_time_value, ] + return(result) + } + + # Make sure to use data.table ways of filtering and selecting + as_of_epi_df <- epi_archive$DT[time_value >= min_time_value & version <= max_version, ] %>% + unique( + by = c("geo_value", "time_value", other_keys), + fromLast = TRUE + ) %>% + tibble::as_tibble() %>% + dplyr::select(-"version") %>% + as_epi_df( + geo_type = epi_archive$geo_type, + time_type = epi_archive$time_type, + as_of = max_version, + additional_metadata = c(epi_archive$additional_metadata, + other_keys = other_keys + ) + ) + + return(as_of_epi_df) +} + + +#' @export +fill_through_version <- function(x, ...) { + UseMethod("fill_through_version") +} + + +#' Fill through version +#' @description Fill in unobserved history using requested scheme by mutating +#' the given object and potentially reseating its fields. See +#' [`epix_fill_through_version`], which doesn't mutate the input archive but +#' might alias its fields. +#' +#' @param epi_archive an `epi_archive` object +#' @param fill_versions_end as in [`epix_fill_through_version`] +#' @param how as in [`epix_fill_through_version`] +#' +#' @importFrom data.table key setkeyv := address copy +#' @importFrom rlang arg_match +fill_through_version.epi_archive2 <- function( + epi_archive, + fill_versions_end, + how = c("na", "locf")) { + validate_version_bound(fill_versions_end, epi_archive$DT, na_ok = FALSE) + how <- arg_match(how) + if (epi_archive$versions_end < fill_versions_end) { + new_DT <- switch(how, + "na" = { + # old DT + a version consisting of all NA observations + # immediately after the last currently/actually-observed + # version. Note that this NA-observation version must only be + # added if `epi_archive` is outdated. + nonversion_key_cols <- setdiff(key(epi_archive$DT), "version") + nonkey_cols <- setdiff(names(epi_archive$DT), key(epi_archive$DT)) + next_version_tag <- next_after(epi_archive$versions_end) + if (next_version_tag > fill_versions_end) { + cli_abort(sprintf(paste( + "Apparent problem with `next_after` method:", + "archive contained observations through version %s", + "and the next possible version was supposed to be %s,", + "but this appeared to jump from a version < %3$s", + "to one > %3$s, implying at least one version in between." + ), epi_archive$versions_end, next_version_tag, fill_versions_end)) + } + nonversion_key_vals_ever_recorded <- unique(epi_archive$DT, by = nonversion_key_cols) + # In edge cases, the `unique` result can alias the original + # DT; detect and copy if necessary: + if (identical(address(epi_archive$DT), address(nonversion_key_vals_ever_recorded))) { + nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) + } + next_version_DT <- nonversion_key_vals_ever_recorded[ + , version := next_version_tag + ][ + # this makes the class of these columns logical (`NA` is a + # logical NA; we're relying on the rbind below to convert to + # the proper class&typeof) + , (nonkey_cols) := NA + ] + # full result DT: + setkeyv(rbind(epi_archive$DT, next_version_DT), key(epi_archive$DT))[] + }, + "locf" = { + # just the old DT; LOCF is built into other methods: + epi_archive$DT + } + ) + new_versions_end <- fill_versions_end + # Update `epi_archive` all at once with simple, error-free operations + + # return below: + epi_archive$DT <- new_DT + epi_archive$versions_end <- new_versions_end + } else { + # Already sufficiently up to date; nothing to do. + } + return(invisible(epi_archive)) +} + + +#' @export +truncate_versions_after <- function(x, ...) { + UseMethod("truncate_versions_after") +} + + +#' Truncate versions after +#' @description Filter to keep only older versions, mutating the archive by +#' potentially reseating but not mutating some fields. `DT` is likely, but not +#' guaranteed, to be copied. Returns the mutated archive +#' [invisibly][base::invisible]. +#' @param epi_archive as in [`epix_truncate_versions_after`] +#' @param max_version as in [`epix_truncate_versions_after`] +truncate_versions_after.epi_archive2 <- function( + epi_archive, + max_version) { + if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { + cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") + } + if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { + cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > epi_archive$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + epi_archive$DT <- epi_archive$DT[epi_archive$DT$version <= max_version, colnames(epi_archive$DT), with = FALSE] + # (^ this filter operation seems to always copy the DT, even if it + # keeps every entry; we don't guarantee this behavior in + # documentation, though, so we could change to alias in this case) + if (!is.na(epi_archive$clobberable_versions_start) && epi_archive$clobberable_versions_start > max_version) { + epi_archive$clobberable_versions_start <- NA + } + epi_archive$versions_end <- max_version + return(invisible(epi_archive)) +} + + +#' Merge epi archive +#' @description Merges another `epi_archive` with the current one, mutating the +#' current one by reseating its `DT` and several other fields, but avoiding +#' mutation of the old `DT`; returns the current archive +#' [invisibly][base::invisible]. See [`epix_merge`] for a full description +#' of the non-R6-method version, which does not mutate either archive, and +#' does not alias either archive's `DT`.a +#' @param x as in [`epix_merge`] +#' @param y as in [`epix_merge`] +#' @param sync as in [`epix_merge`] +#' @param compactify as in [`epix_merge`] +merge_epi_archive2 <- function( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE) { + result <- epix_merge(x, y, + sync = sync, + compactify = compactify + ) + + # TODO: Use encapsulating methods instead. + if (length(x$private_fields) != 0L) { + cli_abort("expected no private fields in x", + internal = TRUE + ) + } + + # Mutate fields all at once, trying to avoid any potential errors: + for (field_name in names(x$public_fields)) { + x[[field_name]] <- result[[field_name]] + } + + return(invisible(x)) +} + + +#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` +#' +#' @param .data An `epi_archive` or `grouped_epi_archive` +#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' perform grouping, but note that, if you are regrouping an already-grouped +#' `.data` object, the calculations will be carried out ignoring such grouping +#' (same as [in dplyr][dplyr::group_by]). +#' * For `ungroup`: either +#' * empty, in order to remove the grouping and output an `epi_archive`; or +#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] +#' expression(s), in order to remove the matching variables from the list of +#' grouping variables, and output another `grouped_epi_archive`. +#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by +#' the variable selection from `...` only; if `TRUE`, the output will be +#' grouped by the current grouping variables plus the variable selection from +#' `...`. +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) +#' +#' @details +#' +#' To match `dplyr`, `group_by` allows "data masking" (also referred to as +#' "tidy evaluation") expressions `...`, not just column names, in a way similar +#' to `mutate`. Note that replacing or removing key columns with these +#' expressions is disabled. +#' +#' `archive %>% group_by()` and other expressions that group or regroup by zero +#' columns (indicating that all rows should be treated as part of one large +#' group) will output a `grouped_epi_archive`, in order to enable the use of +#' `grouped_epi_archive` methods on the result. This is in slight contrast to +#' the same operations on tibbles and grouped tibbles, which will *not* output a +#' `grouped_df` in these circumstances. +#' +#' Using `group_by` with `.add=FALSE` to override the existing grouping is +#' disabled; instead, `ungroup` first then `group_by`. +#' +#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`, +#' introducing column-level aliasing between its input and its result. This +#' doesn't follow the general model for most `data.table` operations, which +#' seems to be that, given an nonaliased (i.e., unique) pointer to a +#' `data.table` object, its pointers to its columns should also be nonaliased. +#' If you mutate any of the columns of either the input or result, first ensure +#' that it is fine if columns of the other are also mutated, but do not rely on +#' such behavior to occur. Additionally, never perform mutation on the key +#' columns at all (except for strictly increasing transformations), as this will +#' invalidate sortedness assumptions about the rows. +#' +#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch +#' to `group_by_drop_default.default` (but there is a dedicated method for +#' `grouped_epi_archive`s). +#' +#' @examples +#' +#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) +#' +#' # `print` for metadata and method listing: +#' grouped_archive %>% print() +#' +#' # The primary use for grouping is to perform a grouped `epix_slide`: +#' +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = as.Date("2020-06-11") + 0:2, +#' new_col_name = "case_rate_3d_av" +#' ) %>% +#' ungroup() +#' +#' # ----------------------------------------------------------------- +#' +#' # Advanced: some other features of dplyr grouping are implemented: +#' +#' library(dplyr) +#' toy_archive <- +#' tribble( +#' ~geo_value, ~age_group, ~time_value, ~version, ~value, +#' "us", "adult", "2000-01-01", "2000-01-02", 121, +#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) +#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) +#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ) %>% +#' mutate( +#' age_group = ordered(age_group, c("pediatric", "adult")), +#' time_value = as.Date(time_value), +#' version = as.Date(version) +#' ) %>% +#' as_epi_archive2(other_keys = "age_group") +#' +#' # The following are equivalent: +#' toy_archive %>% group_by(geo_value, age_group) +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_by(age_group, .add = TRUE) +#' grouping_cols <- c("geo_value", "age_group") +#' toy_archive %>% group_by(across(all_of(grouping_cols))) +#' +#' # And these are equivalent: +#' toy_archive %>% group_by(geo_value) +#' toy_archive %>% +#' group_by(geo_value, age_group) %>% +#' ungroup(age_group) +#' +#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +#' toy_archive %>% +#' group_by(geo_value) %>% +#' groups() +#' +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop = FALSE) %>% +#' epix_slide2(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() +#' +#' @importFrom dplyr group_by +#' @export +#' +#' @aliases grouped_epi_archive +group_by.epi_archive2 <- function(epi_archive, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(epi_archive)) { + # `add` makes no difference; this is an ungrouped `epi_archive`. + detailed_mutate <- epix_detailed_restricted_mutate2(epi_archive, ...) + assert_logical(.drop) + if (!.drop) { + grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + cli_warn( + "`.drop=FALSE` but there are no factor grouping columns; + did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + } else if (any(diff(grouping_col_is_factor) == -1L)) { + cli_warn( + "`.drop=FALSE` but there are one or more non-factor grouping columns listed + after a factor grouping column; this may produce groups with `NA`s for these + columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + depending on how you want completion to work, you might instead want to convert all + grouping columns to factors beforehand, specify the non-factor grouping columns first, + or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + } + } + new_grouped_epi_archive(detailed_mutate[["archive"]], + detailed_mutate[["request_names"]], + drop = .drop + ) +} + + +#' @export +slide <- function(.data, ...) { + UseMethod("slide") +} + + +#' Slide over epi archive +#' @description Slides a given function over variables in an `epi_archive` +#' object. See the documentation for the wrapper function [`epix_slide()`] for +#' details. The parameter descriptions below are copied from there +#' @importFrom data.table key +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms +#' @param f Function, formula, or missing; together with `...` specifies the +#' computation to slide. To "slide" means to apply a computation over a +#' sliding (a.k.a. "rolling") time window for each data group. The window is +#' determined by the `before` parameter described below. One time step is +#' typically one day or one week; see [`epi_slide`] details for more +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. +#' @param ... Additional arguments to pass to the function or formula specified +#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an +#' expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to the `.group_key` and +#' `.ref_time_value`. See details of [`epi_slide`]. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. +#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. (If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) +#' @param ref_time_values Reference time values / versions for sliding +#' computations; each element of this vector serves both as the anchor point +#' for the `time_value` window for the computation and the `max_version` +#' `as_of` which we fetch data in this window. If missing, then this will set +#' to a regularly-spaced sequence of values set to cover the range of +#' `version`s in the `DT` plus the `versions_end`; the spacing of values will +#' be guessed (using the GCD of the skips between values). +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a positive integer and return +#' an object of class `lubridate::period`. For example, we can use `time_step +#' = lubridate::hours` in order to set the time step to be one hour (this +#' would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_name` equal to an existing column name will overwrite this column. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_name` entirely. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. +slide.epi_archive2 <- function(epi_archive, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # For an "ungrouped" slide, treat all rows as belonging to one big + # group (group by 0 vars), like `dplyr::summarize`, and let the + # resulting `grouped_epi_archive` handle the slide: + slide( + group_by(epi_archive), + f, + ..., + before = before, ref_time_values = ref_time_values, + time_step = time_step, new_col_name = new_col_name, + as_list_col = as_list_col, names_sep = names_sep, + all_versions = all_versions + ) %>% + # We want a slide on ungrouped archives to output something + # ungrouped, rather than retaining the trivial (0-variable) + # grouping applied above. So we `ungroup()`. However, the current + # `dplyr` implementation automatically ignores/drops trivial + # groupings, so this is just a no-op for now. + ungroup() +} + + +#' Convert to `epi_archive` format +#' +#' Converts a data frame, data table, or tibble into an `epi_archive` +#' object. See the [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. The parameter descriptions below are copied from there +#' +#' @param x A data frame, data table, or tibble, with columns `geo_value`, +#' `time_value`, `version`, and then any additional number of columns. +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param other_keys Character vector specifying the names of variables in `x` +#' that should be considered key variables (in the language of `data.table`) +#' apart from "geo_value", "time_value", and "version". +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_archive` object. The metadata will have `geo_type` and `time_type` +#' fields; named entries from the passed list or will be included as well. +#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are +#' considered redundant for the purposes of `epi_archive`'s built-in methods +#' such as `as_of`? As these methods use the last version of each observation +#' carried forward (LOCF) to interpolate between the version data provided, +#' rows that don't change these LOCF results can potentially be omitted to +#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or +#' `NULL` will remove these rows and issue a warning. Generally, this can be +#' set to `TRUE`, but if you directly inspect or edit the fields of the +#' `epi_archive` such as its `DT`, you will have to determine whether +#' `compactify=TRUE` will produce the desired results. If compactification +#' here is removing a large proportion of the rows, this may indicate a +#' potential for space, time, or bandwidth savings upstream the data pipeline, +#' e.g., when fetching, storing, or preparing the input data `x` +#' @param clobberable_versions_start Optional; `length`-1; either a value of the +#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and +#' `typeof`: specifically, either (a) the earliest version that could be +#' subject to "clobbering" (being overwritten with different update data, but +#' using the *same* version tag as the old update data), or (b) `NA`, to +#' indicate that no versions are clobberable. There are a variety of reasons +#' why versions could be clobberable under routine circumstances, such as (a) +#' today's version of one/all of the columns being published after initially +#' being filled with `NA` or LOCF, (b) a buggy version of today's data being +#' published but then fixed and republished later in the day, or (c) data +#' pipeline delays (e.g., publisher uploading, periodic scraping, database +#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected +#' later in the day (or even on a different day) than expected; potential +#' causes vary between different data pipelines. The default value is `NA`, +#' which doesn't consider any versions to be clobberable. Another setting that +#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. +#' @param versions_end Optional; length-1, same `class` and `typeof` as +#' `x$version`: what is the last version we have observed? The default is +#' `max_version_with_row_in(x)`, but values greater than this could also be +#' valid, and would indicate that we observed additional versions of the data +#' beyond `max(x$version)`, but they all contained empty updates. (The default +#' value of `clobberable_versions_start` does not fully trust these empty +#' updates, and assumes that any version `>= max(x$version)` could be +#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. +#' @return An `epi_archive` object. +#' +#' @details This simply a wrapper around the `new()` method of the `epi_archive` +#' class, so for example: +#' ``` +#' x <- as_epi_archive(df, geo_type = "state", time_type = "day") +#' ``` +#' would be equivalent to: +#' ``` +#' x <- epi_archive$new(df, geo_type = "state", time_type = "day") +#' ``` +#' +#' @export +#' @examples +#' # Simple ex. with necessary keys +#' tib <- tibble::tibble( +#' geo_value = rep(c("ca", "hi"), each = 5), +#' time_value = rep(seq(as.Date("2020-01-01"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' version = rep(seq(as.Date("2020-01-02"), +#' by = 1, length.out = 5 +#' ), times = 2), +#' value = rnorm(10, mean = 2, sd = 1) +#' ) +#' +#' toy_epi_archive <- tib %>% as_epi_archive2( +#' geo_type = "state", +#' time_type = "day" +#' ) +#' toy_epi_archive +#' +#' # Ex. with an additional key for county +#' df <- data.frame( +#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +#' county = c(1, 3, 2, 5), +#' time_value = c( +#' "2020-06-01", +#' "2020-06-02", +#' "2020-06-01", +#' "2020-06-02" +#' ), +#' version = c( +#' "2020-06-02", +#' "2020-06-03", +#' "2020-06-02", +#' "2020-06-03" +#' ), +#' cases = c(1, 2, 3, 4), +#' cases_rate = c(0.01, 0.02, 0.01, 0.05) +#' ) +#' +#' x <- df %>% as_epi_archive2( +#' geo_type = "state", +#' time_type = "day", +#' other_keys = "county" +#' ) +as_epi_archive2 <- function(x, geo_type, time_type, other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x)) { + new_epi_archive2( + x, geo_type, time_type, other_keys, additional_metadata, + compactify, clobberable_versions_start, versions_end + ) +} + +#' Test for `epi_archive` format +#' +#' @param x An object. +#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also +#' count? Default is `FALSE`. +#' @return `TRUE` if the object inherits from `epi_archive`. +#' +#' @export +#' @examples +#' is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) +#' is_epi_archive2(archive_cases_dv_subset_2) # TRUE +#' +#' # By default, grouped_epi_archives don't count as epi_archives, as they may +#' # support a different set of operations from regular `epi_archives`. This +#' # behavior can be controlled by `grouped_okay`. +#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) +#' is_epi_archive2(grouped_archive) # FALSE +#' is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE +#' +#' @seealso [`is_grouped_epi_archive`] +is_epi_archive2 <- function(x, grouped_okay = FALSE) { + inherits(x, "epi_archive2") || grouped_okay && inherits(x, "grouped_epi_archive2") +} + + +#' @export +clone <- function(x, ...) { + UseMethod("clone") +} + + +#' @export +clone.epi_archive2 <- function(epi_archive, deep = FALSE) { + # TODO: Finish. + if (deep) { + epi_archive$DT <- copy(epi_archive$DT) + } else { + epi_archive$DT <- copy(epi_archive$DT) + } + return(epi_archive) +} diff --git a/R/data.R b/R/data.R index 26b9f39f..37ccc522 100644 --- a/R/data.R +++ b/R/data.R @@ -289,3 +289,11 @@ delayed_assign_with_unregister_awareness( #' * Furthermore, the data has been limited to a very small number of rows, the #' signal names slightly altered, and formatted into a tibble. "jhu_csse_county_level_subset" + +#' @export +"archive_cases_dv_subset_2" + +delayed_assign_with_unregister_awareness( + "archive_cases_dv_subset_2", + as_epi_archive2(archive_cases_dv_subset_dt, compactify = FALSE) +) diff --git a/R/grouped_archive_new.R b/R/grouped_archive_new.R new file mode 100644 index 00000000..c0e6c35e --- /dev/null +++ b/R/grouped_archive_new.R @@ -0,0 +1,456 @@ +#' +#' Convenience function for performing a `tidy_select` on dots according to its +#' docs, and taking the names (rather than the integer indices). +#' +#' @param ... tidyselect-syntax selection description +#' @param .data named vector / data frame; context for the description / the +#' object to which the selections apply +#' @return character vector containing names of entries/columns of +#' `names(.data)` denoting the selection +#' +#' @noRd +eval_pure_select_names_from_dots <- function(..., .data) { + # `?tidyselect::eval_select` tells us to use this form when we take in dots. + # It seems a bit peculiar, since the expr doesn't pack with it a way to get at + # the environment for the dots, but it looks like `eval_select` will assume + # the caller env (our `environment()`) when given an expr, and thus have + # access to the dots. + # + # If we were allowing renaming, we'd need to be careful about which names (new + # vs. old vs. both) to return here. + names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) +} + +#' Get names of dots without forcing the dots +#' +#' For use in functions that use nonstandard evaluation (NSE) on the dots; we +#' can't use the pattern `names(list(...))` in this case because it will attempt +#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the +#' dots if we're using NSE. +#' +#' @noRd +nse_dots_names <- function(...) { + names(rlang::call_match()) +} +nse_dots_names2 <- function(...) { + rlang::names2(rlang::call_match()) +} + +#' @importFrom dplyr group_by_drop_default +#' @noRd +new_grouped_epi_archive <- function(ungrouped, vars, drop) { + if (inherits(ungrouped, "grouped_epi_archive")) { + cli_abort( + "`ungrouped` must not already be grouped (neither automatic regrouping + nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, + or `ungroup` first.", + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(ungrouped), + epiprocess__ungrouped_groups = groups(ungrouped) + ) + } + assert_class(ungrouped, "epi_archive2") + assert_character(vars) + if (!test_subset(vars, names(ungrouped$DT))) { + cli_abort( + "All grouping variables `vars` must be present in the data.", + ) + } + if ("version" %in% vars) { + cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") + } + assert_logical(drop, len = 1) + + # ----- + private <- list() + private$ungrouped <- ungrouped + private$vars <- vars + private$drop <- drop + + return(structure( + list( + private = private + ), + class = c("grouped_epi_archive2", "epi_archive2") + )) +} + +#' @export +print.grouped_epi_archive2 <- function(grouped_epi_archive, class = TRUE) { + if (class) cat("A `grouped_epi_archive` object:\n") + writeLines(wrap_varnames(grouped_epi_archive$private$vars, initial = "* Groups: ")) + # If none of the grouping vars is a factor, then $drop doesn't seem + # relevant, so try to be less verbose and don't message about it. + # + # Below map-then-extract may look weird, but the more natural + # extract-then-map appears to trigger copies of the extracted columns + # since we are working with a `data.table` (unless we go through + # `as.list`, but its current column-aliasing behavior is probably not + # something to rely too much on), while map functions currently appear + # to avoid column copies. + if (any(purrr::map_lgl(grouped_epi_archive$private$ungrouped$DT, is.factor)[grouped_epi_archive$private$vars])) { + cat(strwrap(init = "* ", prefix = " ", sprintf( + "%s groups formed by factor levels that don't appear in the data", + if (grouped_epi_archive$private$drop) "Drops" else "Does not drop" + ))) + cat("\n") + } + cat("It wraps an ungrouped `epi_archive`, with metadata:\n") + print(grouped_epi_archive$private$ungrouped, class = FALSE) + # Return self invisibly for convenience in `$`-"pipe": + invisible(grouped_epi_archive) +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr group_by +#' @export +group_by.grouped_epi_archive2 <- function( + grouped_epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(grouped_epi_archive)) { + assert_logical(.add, len = 1) + if (!.add) { + cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden + (neither automatic regrouping nor nested grouping is supported). + If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. + If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. + ', + class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" + ) + } else { + # `group_by` `...` computations are performed on ungrouped data (see + # `?dplyr::group_by`) + detailed_mutate <- epix_detailed_restricted_mutate2(grouped_epi_archive$private$ungrouped, ...) + out_ungrouped <- detailed_mutate[["archive"]] + vars_from_dots <- detailed_mutate[["request_names"]] + vars <- union(grouped_epi_archive$private$vars, vars_from_dots) + new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, vars, .drop) + } +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @export +group_by_drop_default.grouped_epi_archive2 <- function(grouped_epi_archive) { + grouped_epi_archive$private$drop +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr groups +#' @export +groups.grouped_epi_archive2 <- function(grouped_epi_archive) { + rlang::syms(grouped_epi_archive$private$vars) +} + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @importFrom dplyr ungroup +#' @export +ungroup.grouped_epi_archive2 <- function(grouped_epi_archive, ...) { + if (rlang::dots_n(...) == 0L) { + # No dots = special behavior: remove all grouping vars and convert to + # an ungrouped class, as with `grouped_df`s. + grouped_epi_archive$private$ungrouped + } else { + exclude_vars <- eval_pure_select_names_from_dots(..., .data = grouped_epi_archive$private$ungrouped$DT) + # (requiring a pure selection here is a little stricter than dplyr + # implementations, but passing a renaming selection into `ungroup` + # seems pretty weird.) + result_vars <- grouped_epi_archive$private$vars[!grouped_epi_archive$private$vars %in% exclude_vars] + # `vars` might be length 0 if the user's tidyselection removed all + # grouping vars. Unlike with tibble, opt here to keep the result as a + # grouped_epi_archive, for output class consistency when `...` is + # provided. + new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, result_vars, grouped_epi_archive$private$drop) + } +} + +#' Truncate versions after a given version, grouped +#' @description Filter to keep only older versions by mutating the underlying +#' `epi_archive` using `$truncate_versions_after`. Returns the mutated +#' `grouped_epi_archive` [invisibly][base::invisible]. +#' @param x as in [`epix_truncate_versions_after`] +#' @param max_version as in [`epix_truncate_versions_after`] +#' @export +truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { + # The grouping is irrelevant for this method; if we were to split into + # groups and recombine appropriately, we should get the same result as + # just leveraging the ungrouped method, so just do the latter: + truncate_versions_after(grouped_epi_archive$private$ungrouped, max_version) + return(invisible(grouped_epi_archive)) +} + +#' Truncate versions after a given version, grouped +#' @export +epix_truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { + cloned_group_epi_archive <- clone(grouped_epi_archive, deep = TRUE) + return((truncate_versions_after(cloned_group_epi_archive, max_version))) + # ^ second set of parens drops invisibility +} + + +#' Slide over grouped epi archive +#' @description Slides a given function over variables in a `grouped_epi_archive` +#' object. See the documentation for the wrapper function [`epix_slide()`] for +#' details. +#' @importFrom data.table key address rbindlist setDF +#' @importFrom tibble as_tibble new_tibble validate_tibble +#' @importFrom dplyr group_by groups +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms +#' env missing_arg +#' @export +slide.grouped_epi_archive2 <- function(grouped_epi_archive, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + cli_abort(" + The `group_by` argument to `slide` has been removed; please use + the `group_by` S3 generic function or `$group_by` R6 method + before the slide instead. (If you were instead trying to pass a + `group_by` argument to `f` or create a column named `group_by`, + this check is a false positive, but you will still need to use a + different column name here and rename the resulting column after + the slide.) + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + cli_abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") + } + + if (missing(ref_time_values)) { + ref_time_values <- epix_slide_ref_time_values_default(grouped_epi_archive$private$ungrouped) + } else { + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (any(ref_time_values > grouped_epi_archive$private$ungrouped$versions_end)) { + cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("Some `ref_time_values` are duplicated.") + } + # Sort, for consistency with `epi_slide`, although the current + # implementation doesn't take advantage of it. + ref_time_values <- sort(ref_time_values) + } + + # Validate and pre-process `before`: + if (missing(before)) { + cli_abort("`before` is required (and must be passed by name); + if you did not want to apply a sliding window but rather + to map `as_of` and `f` across various `ref_time_values`, + pass a large `before` value (e.g., if time steps are days, + `before=365000`).") + } + before <- vctrs::vec_cast(before, integer()) + assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) + + # If a custom time step is specified, then redefine units + + if (!missing(time_step)) before <- time_step(before) + + # Symbolize column name + new_col <- sym(new_col_name) + + # Validate rest of parameters: + assert_logical(as_list_col, len = 1L) + assert_logical(all_versions, len = 1L) + assert_character(names_sep, len = 1L, null.ok = TRUE) + + # Computation for one group, one time value + comp_one_grp <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # Carry out the specified computation + comp_value <- f(.data_group, .group_key, ref_time_value, ...) + + if (all_versions) { + # Extract data from archive so we can do length checks below. When + # `all_versions = TRUE`, `.data_group` will always be an ungrouped + # archive because of the preceding `as_of` step. + .data_group <- .data_group$DT + } + + assert( + check_atomic(comp_value, any.missing = TRUE), + check_data_frame(comp_value), + combine = "or", + .var.name = vname(comp_value) + ) + + # Label every result row with the `ref_time_value` + res <- list(time_value = ref_time_value) + + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computations outputs are data + # frames. + res[[new_col]] <- list(comp_value) + + # Convert the list to a tibble all at once for speed. + return(validate_tibble(new_tibble(res))) + } + + # If `f` is missing, interpret ... as an expression for tidy evaluation + if (missing(f)) { + quos <- enquos(...) + if (length(quos) == 0) { + cli_abort("If `f` is missing then a computation must be specified via `...`.") + } + if (length(quos) > 1) { + cli_abort("If `f` is missing then only a single computation can be specified via `...`.") + } + + f <- quos[[1]] + new_col <- sym(names(rlang::quos_auto_name(quos))) + ... <- missing_arg() # magic value that passes zero args as dots in calls below + } + + f <- as_slide_computation(f, ...) + x <- lapply(ref_time_values, function(ref_time_value) { + # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, + # `epi_archive` if `all_versions` is `TRUE`: + as_of_raw <- as_of(grouped_epi_archive$private$ungrouped, + ref_time_value, + min_time_value = ref_time_value - before, + all_versions = all_versions + ) + + # Set: + # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will + # `group_modify` as the `.data` argument. Might or might not + # include version column. + # * `group_modify_fn`, the corresponding `.f` argument + if (!all_versions) { + as_of_df <- as_of_raw + group_modify_fn <- comp_one_grp + } else { + as_of_archive <- as_of_raw + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies. + if (address(as_of_archive$DT) == address(grouped_epi_archive$private$ungrouped$DT)) { + # `as_of` aliased its the full `$DT`; copy before mutating: + # + # Note: this step is probably unneeded; we're fine with + # aliasing of the DT or its columns: vanilla operations aren't + # going to mutate them in-place if they are aliases, and we're + # not performing mutation (unlike the situation with + # `fill_through_version` where we do mutate a `DT` and don't + # want aliasing). + as_of_archive$DT <- copy(as_of_archive$DT) + } + dt_key <- data.table::key(as_of_archive$DT) + as_of_df <- as_of_archive$DT + data.table::setDF(as_of_df) + + # Convert each subgroup chunk to an archive before running the calculation. + group_modify_fn <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # .data_group is coming from as_of_df as a tibble, but we + # want to feed `comp_one_grp` an `epi_archive` backed by a + # DT; convert and wrap: + data.table::setattr(.data_group, "sorted", dt_key) + data.table::setDT(.data_group, key = dt_key) + .data_group_archive <- clone(as_of_archive) + .data_group_archive$DT <- .data_group + comp_one_grp(.data_group_archive, .group_key, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col + ) + } + } + + return( + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) + ) + }) + # Combine output into a single tibble + x <- as_tibble(setDF(rbindlist(x))) + # Reconstruct groups + x <- group_by(x, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop) + + # Unchop/unnest if we need to + if (!as_list_col) { + x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) + } + + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + x <- decay_epi_df(x) + + return(x) +} + + +# At time of writing, roxygen parses content in collation order, impacting the +# presentation of .Rd files that document multiple functions (see +# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining +# `Collate:`) and ordering of functions within each file in order to get the +# desired ordering. + + + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @export +is_grouped_epi_archive2 <- function(x) { + inherits(x, "grouped_epi_archive2") +} + + +#' @export +clone.grouped_epi_archive2 <- function(x, deep = FALSE) { + # TODO: Finish. + if (deep) { + ungrouped <- clone(x$private$ungrouped, deep = TRUE) + } else { + ungrouped <- x$private$ungrouped + } + new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) +} diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 4bcead66..213cf1b1 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -470,9 +470,8 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { #' @noRd epix_detailed_restricted_mutate <- function(.data, ...) { # We don't want to directly use `dplyr::mutate` on the `$DT`, as: - # - this likely copies the entire table # - `mutate` behavior, including the output class, changes depending on - # whether `dtplyr` is loaded and would require post-processing + # whether `dtplyr` < 1.3.0 is loaded and would require post-processing # - behavior with `dtplyr` isn't fully compatible # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not # appropriately handle the `= NULL` and `= ` tidyeval cases diff --git a/R/methods-epi_archive_new.R b/R/methods-epi_archive_new.R new file mode 100644 index 00000000..3ce39afc --- /dev/null +++ b/R/methods-epi_archive_new.R @@ -0,0 +1,826 @@ +#' Generate a snapshot from an `epi_archive` object +#' +#' Generates a snapshot in `epi_df` format from an `epi_archive` object, as of a +#' given version. See the [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. +#' +#' @param x An `epi_archive` object +#' @param max_version Time value specifying the max version to permit in the +#' snapshot. That is, the snapshot will comprise the unique rows of the +#' current archive data that represent the most up-to-date signal values, as +#' of the specified `max_version` (and whose time values are at least +#' `min_time_value`.) +#' @param min_time_value Time value specifying the min time value to permit in +#' the snapshot. Default is `-Inf`, which effectively means that there is no +#' minimum considered. +#' @param all_versions If `all_versions = TRUE`, then the output will be in +#' `epi_archive` format, and contain rows in the specified `time_value` range +#' having `version <= max_version`. The resulting object will cover a +#' potentially narrower `version` and `time_value` range than `x`, depending +#' on user-provided arguments. Otherwise, there will be one row in the output +#' for the `max_version` of each `time_value`. Default is `FALSE`. +#' @return An `epi_df` object. +#' +#' @details This is simply a wrapper around the `as_of()` method of the +#' `epi_archive` class, so if `x` is an `epi_archive` object, then: +#' ``` +#' epix_as_of(x, max_version = v) +#' ``` +#' is equivalent to: +#' ``` +#' x$as_of(max_version = v) +#' ``` +#' +#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input +#' archives, but may in some edge cases alias parts of the inputs, so copy the +#' outputs if needed before using mutating operations like `data.table`'s `:=` +#' operator. Currently, the only situation where there is potentially aliasing +#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change +#' in the future. +#' +#' @examples +#' # warning message of data latency shown +#' epix_as_of2( +#' archive_cases_dv_subset_2, +#' max_version = max(archive_cases_dv_subset_2$DT$version) +#' ) +#' +#' range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 +#' +#' epix_as_of2( +#' archive_cases_dv_subset_2, +#' max_version = as.Date("2020-06-12") +#' ) +#' +#' # When fetching a snapshot as of the latest version with update data in the +#' # archive, a warning is issued by default, as this update data might not yet +#' # be finalized (for example, if data versions are labeled with dates, these +#' # versions might be overwritten throughout the corresponding days with +#' # additional data or "hotfixes" of erroroneous data; when we build an archive +#' # based on database queries, the latest available update might still be +#' # subject to change, but previous versions should be finalized). We can +#' # muffle such warnings with the following pattern: +#' withCallingHandlers( +#' { +#' epix_as_of2( +#' archive_cases_dv_subset_2, +#' max_version = max(archive_cases_dv_subset_2$DT$version) +#' ) +#' }, +#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +#' ) +#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used +#' # to globally toggle these warnings. +#' +#' @export +epix_as_of2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { + assert_class(epi_archive, "epi_archive2") + return(as_of(epi_archive, max_version, min_time_value, all_versions = all_versions)) +} + +#' `epi_archive` with unobserved history filled in (won't mutate, might alias) +#' +#' Sometimes, due to upstream data pipeline issues, we have to work with a +#' version history that isn't completely up to date, but with functions that +#' expect archives that are completely up to date, or equally as up-to-date as +#' another archive. This function provides one way to approach such mismatches: +#' pretend that we've "observed" additional versions, filling in these versions +#' with NAs or extrapolated values. +#' +#' '`epix_fill_through_version` will not mutate its `x` argument, but its result +#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate +#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to +#' give the result, but might reseat its fields (e.g., references to the old +#' `x$DT` might not be updated by this function or subsequent operations on +#' `x`), and returns the updated `x` [invisibly][base::invisible]. +#' +#' @param x An `epi_archive` +#' @param fill_versions_end Length-1, same class&type as `x$version`: the +#' version through which to fill in missing version history; this will be the +#' result's `$versions_end` unless it already had a later +#' `$versions_end`. +#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing +#' required version history with `NA`s, by inserting (if necessary) an update +#' immediately after the current `$versions_end` that revises all +#' existing measurements to be `NA` (this is only supported for `version` +#' classes with a `next_after` implementation); `"locf"` will fill in missing +#' version history with the last version of each observation carried forward +#' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are +#' based on LOCF). Default is `"na"`. +#' @return An `epi_archive` +epix_fill_through_version2 <- function(epi_archive, fill_versions_end, + how = c("na", "locf")) { + assert_class(epi_archive, "epi_archive2") + cloned_epi_archive <- clone(epi_archive) + # Enclosing parentheses drop the invisibility flag. See description above of + # potential mutation and aliasing behavior. + (fill_through_version(cloned_epi_archive, fill_versions_end, how = how)) +} + +#' Merge two `epi_archive` objects +#' +#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and +#' set of key columns. When they also share a common `versions_end`, +#' using `$as_of` on the result should be the same as using `$as_of` on `x` and +#' `y` individually, then performing a full join of the `DT`s on the non-version +#' key columns (potentially consolidating multiple warnings about clobberable +#' versions). If the `versions_end` values differ, the +#' `sync` parameter controls what is done. +#' +#' This function, [`epix_merge`], does not mutate its inputs and will not alias +#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite +#' `x` with the result of the merge, reseating its `DT` and several other fields +#' (making them point to different objects), but avoiding mutation of the +#' contents of the old `DT` (only relevant if you have another reference to the +#' old `DT` in another object). +#' +#' @param x,y Two `epi_archive` objects to join together. +#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the +#' case that `x$versions_end` doesn't match `y$versions_end`, what do we do?: +#' `"forbid"`: emit an error; "na": use `max(x$versions_end, y$versions_end)` +#' as the result's `versions_end`, but ensure that, if we request a snapshot +#' as of a version after `min(x$versions_end, y$versions_end)`, the +#' observation columns from the less up-to-date archive will be all NAs (i.e., +#' imagine there was an update immediately after its `versions_end` which +#' revised all observations to be `NA`); `"locf"`: use `max(x$versions_end, +#' y$versions_end)` as the result's `versions_end`, allowing the last version +#' of each observation to be carried forward to extrapolate unavailable +#' versions for the less up-to-date input archive (i.e., imagining that in the +#' less up-to-date archive's data set remained unchanged between its actual +#' `versions_end` and the other archive's `versions_end`); or `"truncate"`: +#' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, +#' and discard any rows containing update rows for later versions. +#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be +#' compactified? See [`as_epi_archive`] for an explanation of what this means. +#' Default here is `TRUE`. +#' @return the resulting `epi_archive` +#' +#' @details In all cases, `additional_metadata` will be an empty list, and +#' `clobberable_versions_start` will be set to the earliest version that could +#' be clobbered in either input archive. +#' +#' @examples +#' # create two example epi_archive datasets +#' x <- archive_cases_dv_subset_2$DT %>% +#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% +#' as_epi_archive2(compactify = TRUE) +#' y <- archive_cases_dv_subset_2$DT %>% +#' dplyr::select(geo_value, time_value, version, percent_cli) %>% +#' as_epi_archive2(compactify = TRUE) +#' # merge results stored in a third object: +#' xy <- epix_merge2(x, y) +#' +#' @importFrom data.table key set setkeyv +#' @export +epix_merge2 <- function(x, y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE) { + assert_class(x, "epi_archive2") + assert_class(y, "epi_archive2") + sync <- rlang::arg_match(sync) + + if (!identical(x$geo_type, y$geo_type)) { + cli_abort("`x` and `y` must have the same `$geo_type`") + } + + if (!identical(x$time_type, y$time_type)) { + cli_abort("`x` and `y` must have the same `$time_type`") + } + + if (length(x$additional_metadata) != 0L) { + cli_warn("x$additional_metadata won't appear in merge result", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + } + if (length(y$additional_metadata) != 0L) { + cli_warn("y$additional_metadata won't appear in merge result", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + } + result_additional_metadata <- list() + + result_clobberable_versions_start <- + if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { + NA # (any type of NA is fine here) + } else { + min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) + } + + # The actual merge below may not succeed 100% of the time, so do this + # preprocessing using non-mutating (but potentially aliasing) functions. This + # approach potentially uses more memory, but won't leave behind a + # partially-mutated `x` on failure. + if (sync == "forbid") { + if (!identical(x$versions_end, y$versions_end)) { + cli_abort(paste( + "`x` and `y` were not equally up to date version-wise:", + "`x$versions_end` was not identical to `y$versions_end`;", + "either ensure that `x` and `y` are equally up to date before merging,", + "or specify how to deal with this using `sync`" + ), class = "epiprocess__epix_merge_unresolved_sync") + } else { + new_versions_end <- x$versions_end + x_DT <- x$DT + y_DT <- y$DT + } + } else if (sync %in% c("na", "locf")) { + new_versions_end <- max(x$versions_end, y$versions_end) + x_DT <- epix_fill_through_version2(x, new_versions_end, sync)$DT + y_DT <- epix_fill_through_version2(y, new_versions_end, sync)$DT + } else if (sync == "truncate") { + new_versions_end <- min(x$versions_end, y$versions_end) + x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] + y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] + } else { + cli_abort("unimplemented") + } + + # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same + # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to + # split the code into separate functions if we wish), but still refer to + # {x,y}$DT in the error messages (further relying on this assumption). + # + # Check&ensure that the above assumption; if it didn't already hold, we likely + # have a bug in the preprocessing, a weird/invalid archive as input, and/or a + # data.table version with different semantics (which may break other parts of + # our code). + x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) + y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) + if (!x_DT_key_as_expected || !y_DT_key_as_expected) { + cli_warn(" + `epiprocess` internal warning (please report): pre-processing for + epix_merge unexpectedly resulted in an intermediate data table (or + tables) with a different key than the corresponding input archive. + Manually setting intermediate data table keys to the expected values. + ", internal = TRUE) + setkeyv(x_DT, key(x$DT)) + setkeyv(y_DT, key(y$DT)) + } + # Without some sort of annotations of what various columns represent, we can't + # do something that makes sense when merging archives with mismatched keys. + # E.g., even if we assume extra keys represent demographic breakdowns, a + # sensible default treatment of count-type and rate-type value columns would + # differ. + if (!identical(sort(key(x_DT)), sort(key(y_DT)))) { + cli_abort(" + The archives must have the same set of key column names; if the + key columns represent the same things, just with different + names, please retry after manually renaming to match; if they + represent different things (e.g., x has an age breakdown + but y does not), please retry after processing them to share + the same key (e.g., by summarizing x to remove the age breakdown, + or by applying a static age breakdown to y). + ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set") + } + # `by` cols = result (and each input's) `key` cols, and determine + # the row set, determined using a full join via `merge` + # + # non-`by` cols = "value"-ish cols, and are looked up with last + # version carried forward via rolling joins + by <- key(x_DT) # = some perm of key(y_DT) + if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { + cli_abort('Invalid `by`; `by` is currently set to the common `key` of + the two archives, and is expected to contain + "geo_value", "time_value", and "version".', + class = "epiprocess__epi_archive_must_have_required_key_cols" + ) + } + if (length(by) < 1L || utils::tail(by, 1L) != "version") { + cli_abort('Invalid `by`; `by` is currently set to the common `key` of + the two archives, and is expected to have a "version" as + the last key col.', + class = "epiprocess__epi_archive_must_have_version_at_end_of_key" + ) + } + x_nonby_colnames <- setdiff(names(x_DT), by) + y_nonby_colnames <- setdiff(names(y_DT), by) + if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { + cli_abort(" + `x` and `y` DTs have overlapping non-by column names; + this is currently not supported; please manually fix up first: + any overlapping columns that can are key-like should be + incorporated into the key, and other columns should be renamed. + ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") + } + x_by_vals <- x_DT[, by, with = FALSE] + if (anyDuplicated(x_by_vals) != 0L) { + cli_abort(" + The `by` columns must uniquely determine rows of `x$DT`; + the `by` is currently set to the common `key` of the two + archives, so this can be resolved by adding key-like columns + to `x`'s key (to get a unique key). + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + y_by_vals <- y_DT[, by, with = FALSE] + if (anyDuplicated(y_by_vals) != 0L) { + cli_abort(" + The `by` columns must uniquely determine rows of `y$DT`; + the `by` is currently set to the common `key` of the two + archives, so this can be resolved by adding key-like columns + to `y`'s key (to get a unique key). + ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") + } + result_DT <- merge(x_by_vals, y_by_vals, + by = by, + # We must have `all=TRUE` or we may skip updates + # from x and/or y and corrupt the history + all = TRUE, + # We don't want Cartesian products, but the + # by-is-unique-key check above already ensures + # this. (Note that `allow.cartesian=FALSE` doesn't + # actually catch all Cartesian products anyway.) + # Disable superfluous check: + allow.cartesian = TRUE + ) + set( + result_DT, , x_nonby_colnames, + x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, + with = FALSE, + # It's good practice to specify `on`, and we must + # explicitly specify `on` if there's a potential key vs. + # by order mismatch (not possible currently for x + # with by = key(x$DT), but possible for y): + on = by, + # last version carried forward: + roll = TRUE, + # requesting non-version key that doesn't exist in the other archive, + # or before its first version, should result in NA + nomatch = NA, + # see note on `allow.cartesian` above; currently have a + # similar story here. + allow.cartesian = TRUE + ] + ) + set( + result_DT, , y_nonby_colnames, + y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, + with = FALSE, + on = by, + roll = TRUE, + nomatch = NA, + allow.cartesian = TRUE + ] + ) + # The key could be unset in case of a key vs. by order mismatch as + # noted above. Ensure that we keep it: + setkeyv(result_DT, by) + + return(as_epi_archive2( + result_DT[], # clear data.table internal invisibility flag if set + geo_type = x$geo_type, + time_type = x$time_type, + other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), + additional_metadata = result_additional_metadata, + # It'd probably be better to pre-compactify before the merge, and might be + # guaranteed not to be necessary to compactify the merge result if the + # inputs are already compactified, but at time of writing we don't have + # compactify in its own method or field, and it seems like it should be + # pretty fast anyway. + compactify = compactify, + clobberable_versions_start = result_clobberable_versions_start, + versions_end = new_versions_end + )) +} + +# Helpers for `group_by`: + +#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input +#' +#' A workaround for `dplyr:::mutate_cols` not being exported and directly +#' applying test mock libraries likely being impossible (due to mocking another +#' package's S3 generic or method). +#' +#' Use solely with a single call to the [`dplyr::mutate`] function and then +#' `destructure_col_modify_recorder_df`; other applicable operations from +#' [dplyr::dplyr_extending] have not been implemented. +#' +#' @param parent_df the "parent class" data frame to wrap +#' @return a `col_modify_recorder_df` +#' +#' @noRd +new_col_modify_recorder_df <- function(parent_df) { + assert_class(parent_df, "data.frame") + `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) +} + +#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` +#' +#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` +#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the +#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record +#' +#' @noRd +destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { + assert_class(col_modify_recorder_df, "col_modify_recorder_df") + list( + unchanged_parent_df = col_modify_recorder_df %>% + `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% + `class<-`(setdiff(class(.), "col_modify_recorder_df")), + cols = attr(col_modify_recorder_df, + "epiprocess::col_modify_recorder_df::cols", + exact = TRUE + ) + ) +} + +#' `dplyr_col_modify` method that simply records the `cols` argument +#' +#' Must export S3 methods in R >= 4.0, even if they're only designed to be +#' package internals, and must import any corresponding upstream S3 generic +#' functions: +#' @importFrom dplyr dplyr_col_modify +#' @export +#' @noRd +dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { + if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { + cli_abort("`col_modify_recorder_df` can only record `cols` once", + internal = TRUE + ) + } + attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols + data +} + +#' A more detailed but restricted `mutate` for use in `group_by.epi_archive` +#' +#' More detailed: provides the names of the "requested" columns in addition to +#' the output expected from a regular `mutate` method. +#' +#' Restricted: doesn't allow replacing or removing key cols, where a sort is +#' potentially required at best and what the output key should be is unclear at +#' worst. (The originally expected restriction was that the `mutate` parameters +#' not present in `group_by` would not be recognized, but the current +#' implementation just lets `mutate` handle these even anyway, even if they're +#' not part of the regular `group_by` parameters; these arguments would have to +#' be passed by names with dot prefixes, so just hope that the user means to use +#' them here if provided.) +#' +#' This can introduce column-level aliasing in `data.table`s, which isn't really +#' intended in the `data.table` user model but we can make it part of our user +#' model (see +#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table +#' and links). +#' +#' Don't export this without cleaning up language of "mutate" as in side effects +#' vs. "mutate" as in `dplyr::mutate`. +#' @noRd +epix_detailed_restricted_mutate2 <- function(.data, ...) { + # We don't want to directly use `dplyr::mutate` on the `$DT`, as: + # - `mutate` behavior, including the output class, changes depending on + # whether `dtplyr` < 1.3.0 is loaded and would require post-processing + # - behavior with `dtplyr` isn't fully compatible + # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not + # appropriately handle the `= NULL` and `= ` tidyeval cases + # Instead: + # - Use `as.list` to get a shallow copy (undocumented, but apparently + # intended, behavior), then `as_tibble` (also shallow, given a list) to get + # back to something that will use `dplyr`'s included `mutate` method(s), + # then convert this using shallow operations into a `data.table`. + # - Use `col_modify_recorder_df` to get the desired details. + in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") + col_modify_cols <- + destructure_col_modify_recorder_df( + mutate(new_col_modify_recorder_df(in_tbl), ...) + )[["cols"]] + invalidated_key_col_is <- + which(purrr::map_lgl(key(.data$DT), function(key_colname) { + key_colname %in% names(col_modify_cols) && + !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) + })) + if (length(invalidated_key_col_is) != 0L) { + rlang::abort(paste_lines(c( + "Key columns must not be replaced or removed.", + wrap_varnames(key(.data$DT)[invalidated_key_col_is], + initial = "Flagged key cols: " + ) + ))) + } else { + # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing + # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, + # then convert it into a `data.table`. The key should still be valid + # (assuming that the user did not explicitly alter `key(.data$DT)` or the + # columns by reference somehow within `...` tidyeval-style computations, or + # trigger refcount-1 alterations due to still having >1 refcounts on the + # columns), set the "sorted" attribute accordingly to prevent attempted + # sorting (including potential extra copies) or sortedness checking, then + # `setDT` (rather than `as.data.table`, in order to prevent column copying + # to establish ownership according to `data.table`'s memory model). + out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% + data.table::setattr("sorted", data.table::key(.data$DT)) %>% + data.table::setDT(key = key(.data$DT)) + out_archive <- clone(.data) + out_archive$DT <- out_DT + request_names <- names(col_modify_cols) + return(list( + archive = out_archive, + request_names = request_names + )) + # (We might also consider special-casing when `mutate` hands back something + # equivalent (in some sense) to the input (probably only encountered when + # we're dealing with `group_by`), and using just `$DT`, not a shallow copy, + # in the result, primarily in order to hedge against `as.list` or `setDT` + # changing their behavior and generating deep copies somehow. This could + # also prevent storage, and perhaps also generation, of shallow copies, but + # this seems unlikely to be a major gain unless it helps enable some + # in-place modifications of refcount-1 columns (although detecting this case + # seems to be common across `group_by` implementations; maybe there is + # something there).) + } +} + + +#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` +#' +#' Slides a given function over variables in an `epi_archive` object. This +#' behaves similarly to `epi_slide()`, with the key exception that it is +#' version-aware: the sliding computation at any given reference time t is +#' performed on **data that would have been available as of t**. See the +#' [archive +#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for +#' examples. +#' +#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, +#' all data in `x` will be treated as part of a single data group. +#' @param f Function, formula, or missing; together with `...` specifies the +#' computation to slide. To "slide" means to apply a computation over a +#' sliding (a.k.a. "rolling") time window for each data group. The window is +#' determined by the `before` parameter described below. One time step is +#' typically one day or one week; see [`epi_slide`] details for more +#' explanation. If a function, `f` must take an `epi_df` with the same +#' column names as the archive's `DT`, minus the `version` column; followed +#' by a one-row tibble containing the values of the grouping variables for +#' the associated group; followed by a reference time value, usually as a +#' `Date` object; followed by any number of named arguments. If a formula, +#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as +#' in `~ mean (.x$var)` to compute a mean of a column `var` for each +#' group-`ref_time_value` combination. The group key can be accessed via +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the +#' computation. +#' @param ... Additional arguments to pass to the function or formula specified +#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an +#' expression for tidy evaluation; in addition to referring to columns +#' directly by name, the expression has access to `.data` and `.env` pronouns +#' as in `dplyr` verbs, and can also refer to the `.group_key` and +#' `.ref_time_value`. See details of [`epi_slide`]. +#' @param before How far `before` each `ref_time_value` should the sliding +#' window extend? If provided, should be a single, non-NA, +#' [integer-compatible][vctrs::vec_cast] number of time steps. This window +#' endpoint is inclusive. For example, if `before = 7`, and one time step is +#' one day, then to produce a value for a `ref_time_value` of January 8, we +#' apply the given function or formula to data (for each group present) with +#' `time_value`s from January 1 onward, as they were reported on January 8. +#' For typical disease surveillance sources, this will not include any data +#' with a `time_value` of January 8, and, depending on the amount of reporting +#' latency, may not include January 7 or even earlier `time_value`s. (If +#' instead the archive were to hold nowcasts instead of regular surveillance +#' data, then we would indeed expect data for `time_value` January 8. If it +#' were to hold forecasts, then we would expect data for `time_value`s after +#' January 8, and the sliding window would extend as far after each +#' `ref_time_value` as needed to include all such `time_value`s.) +#' @param ref_time_values Reference time values / versions for sliding +#' computations; each element of this vector serves both as the anchor point +#' for the `time_value` window for the computation and the `max_version` +#' `as_of` which we fetch data in this window. If missing, then this will set +#' to a regularly-spaced sequence of values set to cover the range of +#' `version`s in the `DT` plus the `versions_end`; the spacing of values will +#' be guessed (using the GCD of the skips between values). +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a positive integer and return +#' an object of class `lubridate::period`. For example, we can use `time_step +#' = lubridate::hours` in order to set the time step to be one hour (this +#' would only be meaningful if `time_value` is of class `POSIXct`). +#' @param new_col_name String indicating the name of the new column that will +#' contain the derivative values. Default is "slide_value"; note that setting +#' `new_col_name` equal to an existing column name will overwrite this column. +#' @param as_list_col Should the slide results be held in a list column, or be +#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, +#' in which case a list object returned by `f` would be unnested (using +#' [`tidyr::unnest()`]), and, if the slide computations output data frames, +#' the names of the resulting columns are given by prepending `new_col_name` +#' to the names of the list elements. +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_name` entirely. +#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If +#' `all_versions = TRUE`, then `f` will be passed the version history (all +#' `version <= ref_time_value`) for rows having `time_value` between +#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be +#' passed only the most recent `version` for every unique `time_value`. +#' Default is `FALSE`. +#' @return A tibble whose columns are: the grouping variables, `time_value`, +#' containing the reference time values for the slide computation, and a +#' column named according to the `new_col_name` argument, containing the slide +#' values. +#' +#' @details A few key distinctions between the current function and `epi_slide()`: +#' 1. In `f` functions for `epix_slide`, one should not assume that the input +#' data to contain any rows with `time_value` matching the computation's +#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for +#' typical epidemiological surveillance data, observations pertaining to a +#' particular time period (`time_value`) are first reported `as_of` some +#' instant after that time period has ended. +#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend +#' from `before` time steps before a given `ref_time_value` through the last +#' `time_value` available as of version `ref_time_value` (typically, this +#' won't include `ref_time_value` itself, as observations about a particular +#' time interval (e.g., day) are only published after that time interval +#' ends); `epi_slide` windows extend from `before` time steps before a +#' `ref_time_value` through `after` time steps after `ref_time_value`. +#' 3. The input class and columns are similar but different: `epix_slide` +#' (with the default `all_versions=FALSE`) keeps all columns and the +#' `epi_df`-ness of the first argument to each computation; `epi_slide` only +#' provides the grouping variables in the second input, and will convert the +#' first input into a regular tibble if the grouping variables include the +#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will +#' will provide an `epi_archive` rather than an `epi-df` to each +#' computation.) +#' 4. The output class and columns are similar but different: `epix_slide()` +#' returns a tibble containing only the grouping variables, `time_value`, and +#' the new column(s) from the slide computations, whereas `epi_slide()` +#' returns an `epi_df` with all original variables plus the new columns from +#' the slide computations. (Both will mirror the grouping or ungroupedness of +#' their input, with one exception: `epi_archive`s can have trivial +#' (zero-variable) groupings, but these will be dropped in `epix_slide` +#' results as they are not supported by tibbles.) +#' 5. There are no size stability checks or element/row recycling to maintain +#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is +#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly +#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed +#' in the "advanced" vignette. +#' 6. `all_rows` is not supported in `epix_slide`; since the slide +#' computations are allowed more flexibility in their outputs than in +#' `epi_slide`, we can't guess a good representation for missing computations +#' for excluded group-`ref_time_value` pairs. +#' 7. The `ref_time_values` default for `epix_slide` is based on making an +#' evenly-spaced sequence out of the `version`s in the `DT` plus the +#' `versions_end`, rather than the `time_value`s. +#' +#' Apart from the above distinctions, the interfaces between `epix_slide()` and +#' `epi_slide()` are the same. +#' +#' Furthermore, the current function can be considerably slower than +#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch +#' properly-versioned snapshots from the data archive (via its `as_of()` +#' method), and (2) it performs a "manual" sliding of sorts, and does not +#' benefit from the highly efficient `slider` package. For this reason, it +#' should never be used in place of `epi_slide()`, and only used when +#' version-aware sliding is necessary (as it its purpose). +#' +#' Finally, this is simply a wrapper around the `slide()` method of the +#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an +#' object of either of these classes, then: +#' ``` +#' epix_slide(x, new_var = comp(old_var), before = 119) +#' ``` +#' is equivalent to: +#' ``` +#' x$slide(new_var = comp(old_var), before = 119) +#' ``` +#' +#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place +#' mutation of the input archives on their own. In some edge cases the inputs it +#' feeds to the slide computations may alias parts of the input archive, so copy +#' the slide computation inputs if needed before using mutating operations like +#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of +#' the slide operation may alias parts of the input archive, so similarly, make +#' sure to clone and/or copy appropriately before using in-place mutation. +#' +#' @examples +#' library(dplyr) +#' +#' # Reference time points for which we want to compute slide values: +#' ref_time_values <- seq(as.Date("2020-06-01"), +#' as.Date("2020-06-15"), +#' by = "1 day" +#' ) +#' +#' # A simple (but not very useful) example (see the archive vignette for a more +#' # realistic one): +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = ref_time_values, +#' new_col_name = "case_rate_7d_av_recent_av" +#' ) %>% +#' ungroup() +#' # We requested time windows that started 2 days before the corresponding time +#' # values. The actual number of `time_value`s in each computation depends on +#' # the reporting latency of the signal and `time_value` range covered by the +#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +#' # discarded +#' # * 1 `time_value`, for ref time 2020-06-02 +#' # * 2 `time_value`s, for the rest of the results +#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because +#' # of data latency, we'll never have an observation +#' # `time_value == ref_time_value` as of `ref_time_value`. +#' # The example below shows this type of behavior in more detail. +#' +#' # Examining characteristics of the data passed to each computation with +#' # `all_versions=FALSE`. +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' function(x, gk, rtv) { +#' tibble( +#' time_range = if (nrow(x) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) +#' }, +#' n = nrow(x), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = FALSE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' arrange(geo_value, time_value) +#' +#' # --- Advanced: --- +#' +#' # `epix_slide` with `all_versions=FALSE` (the default) applies a +#' # version-unaware computation to several versions of the data. We can also +#' # use `all_versions=TRUE` to apply a version-*aware* computation to several +#' # versions of the data, again looking at characteristics of the data passed +#' # to each computation. In this case, each computation should expect an +#' # `epi_archive` containing the relevant version data: +#' +#' archive_cases_dv_subset_2 %>% +#' group_by(geo_value) %>% +#' epix_slide2( +#' function(x, gk, rtv) { +#' tibble( +#' versions_start = if (nrow(x$DT) == 0L) { +#' "NA (0 rows)" +#' } else { +#' toString(min(x$DT$version)) +#' }, +#' versions_end = x$versions_end, +#' time_range = if (nrow(x$DT) == 0L) { +#' "0 `time_value`s" +#' } else { +#' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) +#' }, +#' n = nrow(x$DT), +#' class1 = class(x)[[1L]] +#' ) +#' }, +#' before = 5, all_versions = TRUE, +#' ref_time_values = ref_time_values, names_sep = NULL +#' ) %>% +#' ungroup() %>% +#' # Focus on one geo_value so we can better see the columns above: +#' filter(geo_value == "ca") %>% +#' select(-geo_value) +#' +#' @importFrom rlang enquo !!! +#' @export +epix_slide2 <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + if (!is_epi_archive2(x, grouped_okay = TRUE)) { + cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") + } + return(slide(x, f, ..., + before = before, + ref_time_values = ref_time_values, + time_step = time_step, + new_col_name = new_col_name, + as_list_col = as_list_col, + names_sep = names_sep, + all_versions = all_versions + )) +} + + +#' Filter an `epi_archive` object to keep only older versions +#' +#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping +#' only rows with `version` falling on or before a specified date. +#' +#' @param x An `epi_archive` object +#' @param max_version Time value specifying the max version to permit in the +#' filtered archive. That is, the output archive will comprise rows of the +#' current archive data having `version` less than or equal to the +#' specified `max_version` +#' @return An `epi_archive` object +#' +#' @export +epix_truncate_versions_after <- function(x, max_version) { + UseMethod("epix_truncate_versions_after") +} + +#' @export +epix_truncate_versions_after.epi_archive2 <- function(x, max_version) { + cloned_epi_archive <- clone(x) + return((truncate_versions_after(x, max_version))) + # ^ second set of parens drops invisibility +} diff --git a/man/as_epi_archive2.Rd b/man/as_epi_archive2.Rd new file mode 100644 index 00000000..bc3f5185 --- /dev/null +++ b/man/as_epi_archive2.Rd @@ -0,0 +1,142 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{as_epi_archive2} +\alias{as_epi_archive2} +\title{Convert to \code{epi_archive} format} +\usage{ +as_epi_archive2( + x, + geo_type, + time_type, + other_keys, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x) +) +} +\arguments{ +\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} + +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list or will be included as well.} + +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or +\code{NULL} will remove these rows and issue a warning. Generally, this can be +set to \code{TRUE}, but if you directly inspect or edit the fields of the +\code{epi_archive} such as its \code{DT}, you will have to determine whether +\code{compactify=TRUE} will produce the desired results. If compactification +here is removing a large proportion of the rows, this may indicate a +potential for space, time, or bandwidth savings upstream the data pipeline, +e.g., when fetching, storing, or preparing the input data \code{x}} + +\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the +same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and +\code{typeof}: specifically, either (a) the earliest version that could be +subject to "clobbering" (being overwritten with different update data, but +using the \emph{same} version tag as the old update data), or (b) \code{NA}, to +indicate that no versions are clobberable. There are a variety of reasons +why versions could be clobberable under routine circumstances, such as (a) +today's version of one/all of the columns being published after initially +being filled with \code{NA} or LOCF, (b) a buggy version of today's data being +published but then fixed and republished later in the day, or (c) data +pipeline delays (e.g., publisher uploading, periodic scraping, database +syncing, periodic fetching, etc.) that make events (a) or (b) reflected +later in the day (or even on a different day) than expected; potential +causes vary between different data pipelines. The default value is \code{NA}, +which doesn't consider any versions to be clobberable. Another setting that +may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} + +\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as +\code{x$version}: what is the last version we have observed? The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. (The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +} +\value{ +An \code{epi_archive} object. +} +\description{ +Converts a data frame, data table, or tibble into an \code{epi_archive} +object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. The parameter descriptions below are copied from there +} +\details{ +This simply a wrapper around the \code{new()} method of the \code{epi_archive} +class, so for example: + +\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} + +would be equivalent to: + +\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") +}\if{html}{\out{
}} +} +\examples{ +# Simple ex. with necessary keys +tib <- tibble::tibble( + geo_value = rep(c("ca", "hi"), each = 5), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), + value = rnorm(10, mean = 2, sd = 1) +) + +toy_epi_archive <- tib \%>\% as_epi_archive2( + geo_type = "state", + time_type = "day" +) +toy_epi_archive + +# Ex. with an additional key for county +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) +) + +x <- df \%>\% as_epi_archive2( + geo_type = "state", + time_type = "day", + other_keys = "county" +) +} diff --git a/man/as_of.epi_archive2.Rd b/man/as_of.epi_archive2.Rd new file mode 100644 index 00000000..21a4cfc1 --- /dev/null +++ b/man/as_of.epi_archive2.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{as_of.epi_archive2} +\alias{as_of.epi_archive2} +\title{As of epi_archive} +\usage{ +\method{as_of}{epi_archive2}(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) +} +\arguments{ +\item{epi_archive}{An \code{epi_archive} object} + +\item{max_version}{Version specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose \code{time_value}s are at least +\code{min_time_value}).} + +\item{min_time_value}{Time value specifying the min \code{time_value} to permit in +the snapshot. Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{all_versions}{Boolean; If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} +} +\description{ +Generates a snapshot in \code{epi_df} format as of a given version. +See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for +details. The parameter descriptions below are copied from there +} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 6a25b2af..86e21b89 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -1,9 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{epi_archive} \alias{epi_archive} \title{\code{epi_archive} object} \description{ +An \code{epi_archive} is an R6 class which contains a data table +along with several relevant pieces of metadata. The data table can be seen +as the full archive (version history) for some signal variables of +interest. + An \code{epi_archive} is an R6 class which contains a data table along with several relevant pieces of metadata. The data table can be seen as the full archive (version history) for some signal variables of @@ -49,6 +54,56 @@ represent potential update data that we do not yet have access to; or in version in which it was first released, or if no version of that observation appears in the archive data at all. +\strong{A word of caution:} R6 objects, unlike most other objects in R, have +reference semantics. A primary consequence of this is that objects are not +copied when modified. You can read more about this in Hadley Wickham's +\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order +to construct a modified archive while keeping the original intact, first +make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} +field with \code{data.table::copy(clone$DT)}, and finally perform the +modifications on the clone. + +epi archive + +An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of +class \code{data.table} from the \code{data.table} package, with (at least) the +following columns: +\itemize{ +\item \code{geo_value}: the geographic value associated with each row of measurements. +\item \code{time_value}: the time value associated with each row of measurements. +\item \code{version}: the time value specifying the version for each row of +measurements. For example, if in a given row the \code{version} is January 15, +2022 and \code{time_value} is January 14, 2022, then this row contains the +measurements of the data for January 14, 2022 that were available one day +later. +} + +The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, +as well as any others (these can be specified when instantiating the +\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating +on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for +information and examples of relevant parameter names for an \code{epi_archive} object. +Note that there can only be a single row per unique combination of +key variables, and thus the key variables are critical for figuring out how +to generate a snapshot of data from the archive, as of a given version. + +In general, the last version of each observation is carried forward (LOCF) to +fill in data between recorded versions, and between the last recorded +update and the \code{versions_end}. One consequence is that the \code{DT} +doesn't have to contain a full snapshot of every version (although this +generally works), but can instead contain only the rows that are new or +changed from the previous version (see \code{compactify}, which does this +automatically). Currently, deletions must be represented as revising a row +to a special state (e.g., making the entries \code{NA} or including a special +column that flags the data as removed and performing some kind of +post-processing), and the archive is unaware of what this state is. Note +that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, +e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to +represent potential update data that we do not yet have access to; or in +\code{\link{epix_merge}} to represent the "value" of an observation before the +version in which it was first released, or if no version of that +observation appears in the archive data at all. + \strong{A word of caution:} R6 objects, unlike most other objects in R, have reference semantics. A primary consequence of this is that objects are not copied when modified. You can read more about this in Hadley Wickham's @@ -60,6 +115,22 @@ modifications on the clone. } \section{Metadata}{ +The following pieces of metadata are included as fields in an \code{epi_archive} +object: +\itemize{ +\item \code{geo_type}: the type for the geo values. +\item \code{time_type}: the type for the time values. +\item \code{additional_metadata}: list of additional metadata for the data archive. +} + +Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be +accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, +etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the +metadata of an \code{epi_archive} object are not currently used by any +downstream functions in the \code{epiprocess} package, and serve only as useful +bits of information to convey about the data set at hand. + + The following pieces of metadata are included as fields in an \code{epi_archive} object: \itemize{ @@ -78,6 +149,13 @@ bits of information to convey about the data set at hand. \section{Generating Snapshots}{ +An \code{epi_archive} object can be used to generate a snapshot of the data in +\code{epi_df} format, which represents the most up-to-date values of the signal +variables, as of the specified version. This is accomplished by calling the +\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this +method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. + + An \code{epi_archive} object can be used to generate a snapshot of the data in \code{epi_df} format, which represents the most up-to-date values of the signal variables, as of the specified version. This is accomplished by calling the @@ -87,6 +165,16 @@ method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_o \section{Sliding Computations}{ +We can run a sliding computation over an \code{epi_archive} object, much like +\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling +the \code{slide()} method for an \code{epi_archive} object, which works similarly to +the way \code{epi_slide()} works for an \code{epi_df} object, but with one key +difference: it is version-aware. That is, for an \code{epi_archive} object, the +sliding computation at any given reference time point t is performed on +\strong{data that would have been available as of t}. More details on \code{slide()} +are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. + + We can run a sliding computation over an \code{epi_archive} object, much like \code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling the \code{slide()} method for an \code{epi_archive} object, which works similarly to @@ -114,6 +202,22 @@ toy_epi_archive <- tib \%>\% epi_archive$new( time_type = "day" ) toy_epi_archive +tib <- tibble::tibble( + geo_value = rep(c("ca", "hi"), each = 5), + time_value = rep(seq(as.Date("2020-01-01"), + by = 1, length.out = 5 + ), times = 2), + version = rep(seq(as.Date("2020-01-02"), + by = 1, length.out = 5 + ), times = 2), + value = rnorm(10, mean = 2, sd = 1) +) + +toy_epi_archive <- tib \%>\% new_epi_archive2( + geo_type = "state", + time_type = "day" +) +toy_epi_archive } \section{Public fields}{ \if{html}{\out{
}} diff --git a/man/epix_as_of2.Rd b/man/epix_as_of2.Rd new file mode 100644 index 00000000..ac69e9a9 --- /dev/null +++ b/man/epix_as_of2.Rd @@ -0,0 +1,95 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_as_of2} +\alias{epix_as_of2} +\title{Generate a snapshot from an \code{epi_archive} object} +\usage{ +epix_as_of2( + epi_archive, + max_version, + min_time_value = -Inf, + all_versions = FALSE +) +} +\arguments{ +\item{max_version}{Time value specifying the max version to permit in the +snapshot. That is, the snapshot will comprise the unique rows of the +current archive data that represent the most up-to-date signal values, as +of the specified \code{max_version} (and whose time values are at least +\code{min_time_value}.)} + +\item{min_time_value}{Time value specifying the min time value to permit in +the snapshot. Default is \code{-Inf}, which effectively means that there is no +minimum considered.} + +\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in +\code{epi_archive} format, and contain rows in the specified \code{time_value} range +having \code{version <= max_version}. The resulting object will cover a +potentially narrower \code{version} and \code{time_value} range than \code{x}, depending +on user-provided arguments. Otherwise, there will be one row in the output +for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} + +\item{x}{An \code{epi_archive} object} +} +\value{ +An \code{epi_df} object. +} +\description{ +Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, as of a +given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. +} +\details{ +This is simply a wrapper around the \code{as_of()} method of the +\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: + +\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) +}\if{html}{\out{
}} + +Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input +archives, but may in some edge cases alias parts of the inputs, so copy the +outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} +operator. Currently, the only situation where there is potentially aliasing +is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change +in the future. +} +\examples{ +# warning message of data latency shown +epix_as_of2( + archive_cases_dv_subset_2, + max_version = max(archive_cases_dv_subset_2$DT$version) +) + +range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 + +epix_as_of2( + archive_cases_dv_subset_2, + max_version = as.Date("2020-06-12") +) + +# When fetching a snapshot as of the latest version with update data in the +# archive, a warning is issued by default, as this update data might not yet +# be finalized (for example, if data versions are labeled with dates, these +# versions might be overwritten throughout the corresponding days with +# additional data or "hotfixes" of erroroneous data; when we build an archive +# based on database queries, the latest available update might still be +# subject to change, but previous versions should be finalized). We can +# muffle such warnings with the following pattern: +withCallingHandlers( + { + epix_as_of2( + archive_cases_dv_subset_2, + max_version = max(archive_cases_dv_subset_2$DT$version) + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +) +# Since R 4.0, there is a `globalCallingHandlers` function that can be used +# to globally toggle these warnings. + +} diff --git a/man/epix_fill_through_version2.Rd b/man/epix_fill_through_version2.Rd new file mode 100644 index 00000000..7389388a --- /dev/null +++ b/man/epix_fill_through_version2.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_fill_through_version2} +\alias{epix_fill_through_version2} +\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} +\usage{ +epix_fill_through_version2( + epi_archive, + fill_versions_end, + how = c("na", "locf") +) +} +\arguments{ +\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the +version through which to fill in missing version history; this will be the +result's \verb{$versions_end} unless it already had a later +\verb{$versions_end}.} + +\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing +required version history with \code{NA}s, by inserting (if necessary) an update +immediately after the current \verb{$versions_end} that revises all +existing measurements to be \code{NA} (this is only supported for \code{version} +classes with a \code{next_after} implementation); \code{"locf"} will fill in missing +version history with the last version of each observation carried forward +(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are +based on LOCF). Default is \code{"na"}.} + +\item{x}{An \code{epi_archive}} +} +\value{ +An \code{epi_archive} +} +\description{ +Sometimes, due to upstream data pipeline issues, we have to work with a +version history that isn't completely up to date, but with functions that +expect archives that are completely up to date, or equally as up-to-date as +another archive. This function provides one way to approach such mismatches: +pretend that we've "observed" additional versions, filling in these versions +with NAs or extrapolated values. +} +\details{ +'\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result +might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate +\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to +give the result, but might reseat its fields (e.g., references to the old +\code{x$DT} might not be updated by this function or subsequent operations on +\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. +} diff --git a/man/epix_merge2.Rd b/man/epix_merge2.Rd new file mode 100644 index 00000000..11d0aff5 --- /dev/null +++ b/man/epix_merge2.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_merge2} +\alias{epix_merge2} +\title{Merge two \code{epi_archive} objects} +\usage{ +epix_merge2( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE +) +} +\arguments{ +\item{x, y}{Two \code{epi_archive} objects to join together.} + +\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the +case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?: +\code{"forbid"}: emit an error; "na": use \code{max(x$versions_end, y$versions_end)} +as the result's \code{versions_end}, but ensure that, if we request a snapshot +as of a version after \code{min(x$versions_end, y$versions_end)}, the +observation columns from the less up-to-date archive will be all NAs (i.e., +imagine there was an update immediately after its \code{versions_end} which +revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version +of each observation to be carried forward to extrapolate unavailable +versions for the less up-to-date input archive (i.e., imagining that in the +less up-to-date archive's data set remained unchanged between its actual +\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}: +use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}, +and discard any rows containing update rows for later versions.} + +\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be +compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. +Default here is \code{TRUE}.} +} +\value{ +the resulting \code{epi_archive} +} +\description{ +Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and +set of key columns. When they also share a common \code{versions_end}, +using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and +\code{y} individually, then performing a full join of the \code{DT}s on the non-version +key columns (potentially consolidating multiple warnings about clobberable +versions). If the \code{versions_end} values differ, the +\code{sync} parameter controls what is done. +} +\details{ +This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias +either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite +\code{x} with the result of the merge, reseating its \code{DT} and several other fields +(making them point to different objects), but avoiding mutation of the +contents of the old \code{DT} (only relevant if you have another reference to the +old \code{DT} in another object). + +In all cases, \code{additional_metadata} will be an empty list, and +\code{clobberable_versions_start} will be set to the earliest version that could +be clobbered in either input archive. +} +\examples{ +# create two example epi_archive datasets +x <- archive_cases_dv_subset_2$DT \%>\% + dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% + as_epi_archive2(compactify = TRUE) +y <- archive_cases_dv_subset_2$DT \%>\% + dplyr::select(geo_value, time_value, version, percent_cli) \%>\% + as_epi_archive2(compactify = TRUE) +# merge results stored in a third object: +xy <- epix_merge2(x, y) + +} diff --git a/man/epix_slide2.Rd b/man/epix_slide2.Rd new file mode 100644 index 00000000..8d822bc0 --- /dev/null +++ b/man/epix_slide2.Rd @@ -0,0 +1,283 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive_new.R +\name{epix_slide2} +\alias{epix_slide2} +\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} +\usage{ +epix_slide2( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, +all data in \code{x} will be treated as part of a single data group.} + +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\value{ +A tibble whose columns are: the grouping variables, \code{time_value}, +containing the reference time values for the slide computation, and a +column named according to the \code{new_col_name} argument, containing the slide +values. +} +\description{ +Slides a given function over variables in an \code{epi_archive} object. This +behaves similarly to \code{epi_slide()}, with the key exception that it is +version-aware: the sliding computation at any given reference time t is +performed on \strong{data that would have been available as of t}. See the +\href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for +examples. +} +\details{ +A few key distinctions between the current function and \code{epi_slide()}: +\enumerate{ +\item In \code{f} functions for \code{epix_slide}, one should not assume that the input +data to contain any rows with \code{time_value} matching the computation's +\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for +typical epidemiological surveillance data, observations pertaining to a +particular time period (\code{time_value}) are first reported \code{as_of} some +instant after that time period has ended. +\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend +from \code{before} time steps before a given \code{ref_time_value} through the last +\code{time_value} available as of version \code{ref_time_value} (typically, this +won't include \code{ref_time_value} itself, as observations about a particular +time interval (e.g., day) are only published after that time interval +ends); \code{epi_slide} windows extend from \code{before} time steps before a +\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. +\item The input class and columns are similar but different: \code{epix_slide} +(with the default \code{all_versions=FALSE}) keeps all columns and the +\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only +provides the grouping variables in the second input, and will convert the +first input into a regular tibble if the grouping variables include the +essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will +will provide an \code{epi_archive} rather than an \code{epi-df} to each +computation.) +\item The output class and columns are similar but different: \code{epix_slide()} +returns a tibble containing only the grouping variables, \code{time_value}, and +the new column(s) from the slide computations, whereas \code{epi_slide()} +returns an \code{epi_df} with all original variables plus the new columns from +the slide computations. (Both will mirror the grouping or ungroupedness of +their input, with one exception: \code{epi_archive}s can have trivial +(zero-variable) groupings, but these will be dropped in \code{epix_slide} +results as they are not supported by tibbles.) +\item There are no size stability checks or element/row recycling to maintain +size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is +roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly +analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed +in the "advanced" vignette. +\item \code{all_rows} is not supported in \code{epix_slide}; since the slide +computations are allowed more flexibility in their outputs than in +\code{epi_slide}, we can't guess a good representation for missing computations +for excluded group-\code{ref_time_value} pairs. +\item The \code{ref_time_values} default for \code{epix_slide} is based on making an +evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the +\code{versions_end}, rather than the \code{time_value}s. +} + +Apart from the above distinctions, the interfaces between \code{epix_slide()} and +\code{epi_slide()} are the same. + +Furthermore, the current function can be considerably slower than +\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch +properly-versioned snapshots from the data archive (via its \code{as_of()} +method), and (2) it performs a "manual" sliding of sorts, and does not +benefit from the highly efficient \code{slider} package. For this reason, it +should never be used in place of \code{epi_slide()}, and only used when +version-aware sliding is necessary (as it its purpose). + +Finally, this is simply a wrapper around the \code{slide()} method of the +\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an +object of either of these classes, then: + +\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +is equivalent to: + +\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) +}\if{html}{\out{
}} + +Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place +mutation of the input archives on their own. In some edge cases the inputs it +feeds to the slide computations may alias parts of the input archive, so copy +the slide computation inputs if needed before using mutating operations like +\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of +the slide operation may alias parts of the input archive, so similarly, make +sure to clone and/or copy appropriately before using in-place mutation. +} +\examples{ +library(dplyr) + +# Reference time points for which we want to compute slide values: +ref_time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-15"), + by = "1 day" +) + +# A simple (but not very useful) example (see the archive vignette for a more +# realistic one): +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = ref_time_values, + new_col_name = "case_rate_7d_av_recent_av" + ) \%>\% + ungroup() +# We requested time windows that started 2 days before the corresponding time +# values. The actual number of `time_value`s in each computation depends on +# the reporting latency of the signal and `time_value` range covered by the +# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have +# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically +# discarded +# * 1 `time_value`, for ref time 2020-06-02 +# * 2 `time_value`s, for the rest of the results +# * never the 3 `time_value`s we would get from `epi_slide`, since, because +# of data latency, we'll never have an observation +# `time_value == ref_time_value` as of `ref_time_value`. +# The example below shows this type of behavior in more detail. + +# Examining characteristics of the data passed to each computation with +# `all_versions=FALSE`. +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + function(x, gk, rtv) { + tibble( + time_range = if (nrow(x) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) + }, + n = nrow(x), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = FALSE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + arrange(geo_value, time_value) + +# --- Advanced: --- + +# `epix_slide` with `all_versions=FALSE` (the default) applies a +# version-unaware computation to several versions of the data. We can also +# use `all_versions=TRUE` to apply a version-*aware* computation to several +# versions of the data, again looking at characteristics of the data passed +# to each computation. In this case, each computation should expect an +# `epi_archive` containing the relevant version data: + +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + function(x, gk, rtv) { + tibble( + versions_start = if (nrow(x$DT) == 0L) { + "NA (0 rows)" + } else { + toString(min(x$DT$version)) + }, + versions_end = x$versions_end, + time_range = if (nrow(x$DT) == 0L) { + "0 `time_value`s" + } else { + sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) + }, + n = nrow(x$DT), + class1 = class(x)[[1L]] + ) + }, + before = 5, all_versions = TRUE, + ref_time_values = ref_time_values, names_sep = NULL + ) \%>\% + ungroup() \%>\% + # Focus on one geo_value so we can better see the columns above: + filter(geo_value == "ca") \%>\% + select(-geo_value) + +} diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd index 8f741418..f30be07f 100644 --- a/man/epix_truncate_versions_after.Rd +++ b/man/epix_truncate_versions_after.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, +% R/methods-epi_archive_new.R \name{epix_truncate_versions_after} \alias{epix_truncate_versions_after} \title{Filter an \code{epi_archive} object to keep only older versions} \usage{ +epix_truncate_versions_after(x, max_version) + epix_truncate_versions_after(x, max_version) } \arguments{ @@ -15,9 +18,14 @@ current archive data having \code{version} less than or equal to the specified \code{max_version}} } \value{ +An \code{epi_archive} object + An \code{epi_archive} object } \description{ +Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping +only rows with \code{version} falling on or before a specified date. + Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping only rows with \code{version} falling on or before a specified date. } diff --git a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..5fba48fb --- /dev/null +++ b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{epix_truncate_versions_after.grouped_epi_archive2} +\alias{epix_truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{epix_truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\description{ +Truncate versions after a given version, grouped +} diff --git a/man/fill_through_version.epi_archive2.Rd b/man/fill_through_version.epi_archive2.Rd new file mode 100644 index 00000000..48afb864 --- /dev/null +++ b/man/fill_through_version.epi_archive2.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{fill_through_version.epi_archive2} +\alias{fill_through_version.epi_archive2} +\title{Fill through version} +\usage{ +\method{fill_through_version}{epi_archive2}(epi_archive, fill_versions_end, how = c("na", "locf")) +} +\arguments{ +\item{epi_archive}{an \code{epi_archive} object} + +\item{fill_versions_end}{as in \code{\link{epix_fill_through_version}}} + +\item{how}{as in \code{\link{epix_fill_through_version}}} +} +\description{ +Fill in unobserved history using requested scheme by mutating +the given object and potentially reseating its fields. See +\code{\link{epix_fill_through_version}}, which doesn't mutate the input archive but +might alias its fields. +} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index 5e867bf3..f157e834 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -1,8 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_archive_new.R, +% R/grouped_epi_archive.R \name{group_by.epi_archive} \alias{group_by.epi_archive} \alias{grouped_epi_archive} +\alias{group_by.grouped_epi_archive2} +\alias{group_by_drop_default.grouped_epi_archive2} +\alias{groups.grouped_epi_archive2} +\alias{ungroup.grouped_epi_archive2} +\alias{is_grouped_epi_archive2} \alias{group_by.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} @@ -12,6 +18,21 @@ \usage{ \method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by}{grouped_epi_archive2}( + grouped_epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(grouped_epi_archive) +) + +\method{group_by_drop_default}{grouped_epi_archive2}(grouped_epi_archive) + +\method{groups}{grouped_epi_archive2}(grouped_epi_archive) + +\method{ungroup}{grouped_epi_archive2}(grouped_epi_archive, ...) + +is_grouped_epi_archive2(x) + \method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) \method{groups}{grouped_epi_archive}(x) diff --git a/man/group_by.epi_archive2.Rd b/man/group_by.epi_archive2.Rd new file mode 100644 index 00000000..fa9040c3 --- /dev/null +++ b/man/group_by.epi_archive2.Rd @@ -0,0 +1,147 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{group_by.epi_archive2} +\alias{group_by.epi_archive2} +\alias{grouped_epi_archive} +\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} +\usage{ +\method{group_by}{epi_archive2}( + epi_archive, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(epi_archive) +) +} +\arguments{ +\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); +\itemize{ +\item For \code{group_by}: unquoted variable name(s) or other +\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to +use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to +perform grouping, but note that, if you are regrouping an already-grouped +\code{.data} object, the calculations will be carried out ignoring such grouping +(same as \link[dplyr:group_by]{in dplyr}). +\item For \code{ungroup}: either +\itemize{ +\item empty, in order to remove the grouping and output an \code{epi_archive}; or +\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} +expression(s), in order to remove the matching variables from the list of +grouping variables, and output another \code{grouped_epi_archive}. +} +}} + +\item{.add}{Boolean. If \code{FALSE}, the default, the output will be grouped by +the variable selection from \code{...} only; if \code{TRUE}, the output will be +grouped by the current grouping variables plus the variable selection from +\code{...}.} + +\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of +factor columns.} + +\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} + +\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for +\code{is_grouped_epi_archive}: any object} + +\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or +\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; +\code{grouped_epi_archive} dispatches its own S3 method)} +} +\description{ +\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} +} +\details{ +To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as +"tidy evaluation") expressions \code{...}, not just column names, in a way similar +to \code{mutate}. Note that replacing or removing key columns with these +expressions is disabled. + +\code{archive \%>\% group_by()} and other expressions that group or regroup by zero +columns (indicating that all rows should be treated as part of one large +group) will output a \code{grouped_epi_archive}, in order to enable the use of +\code{grouped_epi_archive} methods on the result. This is in slight contrast to +the same operations on tibbles and grouped tibbles, which will \emph{not} output a +\code{grouped_df} in these circumstances. + +Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is +disabled; instead, \code{ungroup} first then \code{group_by}. + +Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT}, +introducing column-level aliasing between its input and its result. This +doesn't follow the general model for most \code{data.table} operations, which +seems to be that, given an nonaliased (i.e., unique) pointer to a +\code{data.table} object, its pointers to its columns should also be nonaliased. +If you mutate any of the columns of either the input or result, first ensure +that it is fine if columns of the other are also mutated, but do not rely on +such behavior to occur. Additionally, never perform mutation on the key +columns at all (except for strictly increasing transformations), as this will +invalidate sortedness assumptions about the rows. + +\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch +to \code{group_by_drop_default.default} (but there is a dedicated method for +\code{grouped_epi_archive}s). +} +\examples{ + +grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) + +# `print` for metadata and method listing: +grouped_archive \%>\% print() + +# The primary use for grouping is to perform a grouped `epix_slide`: + +archive_cases_dv_subset_2 \%>\% + group_by(geo_value) \%>\% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = as.Date("2020-06-11") + 0:2, + new_col_name = "case_rate_3d_av" + ) \%>\% + ungroup() + +# ----------------------------------------------------------------- + +# Advanced: some other features of dplyr grouping are implemented: + +library(dplyr) +toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) \%>\% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) \%>\% + as_epi_archive2(other_keys = "age_group") + +# The following are equivalent: +toy_archive \%>\% group_by(geo_value, age_group) +toy_archive \%>\% + group_by(geo_value) \%>\% + group_by(age_group, .add = TRUE) +grouping_cols <- c("geo_value", "age_group") +toy_archive \%>\% group_by(across(all_of(grouping_cols))) + +# And these are equivalent: +toy_archive \%>\% group_by(geo_value) +toy_archive \%>\% + group_by(geo_value, age_group) \%>\% + ungroup(age_group) + +# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +toy_archive \%>\% + group_by(geo_value) \%>\% + groups() + +toy_archive \%>\% + group_by(geo_value, age_group, .drop = FALSE) \%>\% + epix_slide2(f = ~ sum(.x$value), before = 20) \%>\% + ungroup() + +} diff --git a/man/is_epi_archive2.Rd b/man/is_epi_archive2.Rd new file mode 100644 index 00000000..df258d3e --- /dev/null +++ b/man/is_epi_archive2.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{is_epi_archive2} +\alias{is_epi_archive2} +\title{Test for \code{epi_archive} format} +\usage{ +is_epi_archive2(x, grouped_okay = FALSE) +} +\arguments{ +\item{x}{An object.} + +\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also +count? Default is \code{FALSE}.} +} +\value{ +\code{TRUE} if the object inherits from \code{epi_archive}. +} +\description{ +Test for \code{epi_archive} format +} +\examples{ +is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) +is_epi_archive2(archive_cases_dv_subset_2) # TRUE + +# By default, grouped_epi_archives don't count as epi_archives, as they may +# support a different set of operations from regular `epi_archives`. This +# behavior can be controlled by `grouped_okay`. +grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) +is_epi_archive2(grouped_archive) # FALSE +is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE + +} +\seealso{ +\code{\link{is_grouped_epi_archive}} +} diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index cca554fa..6f0d35b3 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -1,18 +1,25 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{max_version_with_row_in} \alias{max_version_with_row_in} \title{\code{max(x$version)}, with error if \code{x} has 0 rows} \usage{ +max_version_with_row_in(x) + max_version_with_row_in(x) } \arguments{ \item{x}{\code{x} argument of \code{\link{as_epi_archive}}} } \value{ +\code{max(x$version)} if it has any rows; raises error if it has 0 rows or +an \code{NA} version value + \code{max(x$version)} if it has any rows; raises error if it has 0 rows or an \code{NA} version value } \description{ +Exported to make defaults more easily copyable. + Exported to make defaults more easily copyable. } diff --git a/man/merge_epi_archive2.Rd b/man/merge_epi_archive2.Rd new file mode 100644 index 00000000..dd1e671e --- /dev/null +++ b/man/merge_epi_archive2.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{merge_epi_archive2} +\alias{merge_epi_archive2} +\title{Merge epi archive} +\usage{ +merge_epi_archive2( + x, + y, + sync = c("forbid", "na", "locf", "truncate"), + compactify = TRUE +) +} +\arguments{ +\item{x}{as in \code{\link{epix_merge}}} + +\item{y}{as in \code{\link{epix_merge}}} + +\item{sync}{as in \code{\link{epix_merge}}} + +\item{compactify}{as in \code{\link{epix_merge}}} +} +\description{ +Merges another \code{epi_archive} with the current one, mutating the +current one by reseating its \code{DT} and several other fields, but avoiding +mutation of the old \code{DT}; returns the current archive +\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description +of the non-R6-method version, which does not mutate either archive, and +does not alias either archive's \code{DT}.a +} diff --git a/man/new_epi_archive2.Rd b/man/new_epi_archive2.Rd new file mode 100644 index 00000000..52141190 --- /dev/null +++ b/man/new_epi_archive2.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{new_epi_archive2} +\alias{new_epi_archive2} +\title{New epi archive} +\usage{ +new_epi_archive2( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL +) +} +\arguments{ +\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} + +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} + +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} + +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list or will be included as well.} + +\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are +considered redundant for the purposes of \code{epi_archive}'s built-in methods +such as \code{as_of}? As these methods use the last version of each observation +carried forward (LOCF) to interpolate between the version data provided, +rows that don't change these LOCF results can potentially be omitted to +save space while maintaining the same behavior (with the help of the +\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). +\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will +remove these rows and issue a warning. Generally, this can be set to +\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} +such as its \code{DT}, or rely on redundant updates to achieve a certain +behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to +determine whether \code{compactify=TRUE} will produce the desired results. If +compactification here is removing a large proportion of the rows, this may +indicate a potential for space, time, or bandwidth savings upstream the +data pipeline, e.g., by avoiding fetching, storing, or processing these +rows of \code{x}.} + +\item{clobberable_versions_start}{Optional; as in \code{\link{as_epi_archive}}} + +\item{versions_end}{Optional; as in \code{\link{as_epi_archive}}} +} +\value{ +An \code{epi_archive} object. +} +\description{ +Creates a new \code{epi_archive} object. +} +\details{ +Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information +and examples of parameter names. +} diff --git a/man/next_after.Rd b/man/next_after.Rd index 5170e8d9..82fd3ebb 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -1,17 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R +% Please edit documentation in R/archive.R, R/archive_new.R \name{next_after} \alias{next_after} \title{Get the next possible value greater than \code{x} of the same type} \usage{ +next_after(x) + next_after(x) } \arguments{ \item{x}{the starting "value"(s)} } \value{ +same class, typeof, and length as \code{x} + same class, typeof, and length as \code{x} } \description{ +Get the next possible value greater than \code{x} of the same type + Get the next possible value greater than \code{x} of the same type } diff --git a/man/print.epi_archive2.Rd b/man/print.epi_archive2.Rd new file mode 100644 index 00000000..0105c47e --- /dev/null +++ b/man/print.epi_archive2.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{print.epi_archive2} +\alias{print.epi_archive2} +\title{Print information about an \code{epi_archive} object} +\usage{ +\method{print}{epi_archive2}(epi_archive, class = TRUE, methods = TRUE) +} +\arguments{ +\item{class}{Boolean; whether to print the class label header} + +\item{methods}{Boolean; whether to print all available methods of +the archive} +} +\description{ +Print information about an \code{epi_archive} object +} diff --git a/man/slide.epi_archive2.Rd b/man/slide.epi_archive2.Rd new file mode 100644 index 00000000..54db5636 --- /dev/null +++ b/man/slide.epi_archive2.Rd @@ -0,0 +1,101 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{slide.epi_archive2} +\alias{slide.epi_archive2} +\title{Slide over epi archive} +\usage{ +\method{slide}{epi_archive2}( + epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\arguments{ +\item{f}{Function, formula, or missing; together with \code{...} specifies the +computation to slide. To "slide" means to apply a computation over a +sliding (a.k.a. "rolling") time window for each data group. The window is +determined by the \code{before} parameter described below. One time step is +typically one day or one week; see \code{\link{epi_slide}} details for more +explanation. If a function, \code{f} must take an \code{epi_df} with the same +column names as the archive's \code{DT}, minus the \code{version} column; followed +by a one-row tibble containing the values of the grouping variables for +the associated group; followed by a reference time value, usually as a +\code{Date} object; followed by any number of named arguments. If a formula, +\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as +in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each +group-\code{ref_time_value} combination. The group key can be accessed via +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the +computation.} + +\item{...}{Additional arguments to pass to the function or formula specified +via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an +expression for tidy evaluation; in addition to referring to columns +directly by name, the expression has access to \code{.data} and \code{.env} pronouns +as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and +\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} + +\item{before}{How far \code{before} each \code{ref_time_value} should the sliding +window extend? If provided, should be a single, non-NA, +\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window +endpoint is inclusive. For example, if \code{before = 7}, and one time step is +one day, then to produce a value for a \code{ref_time_value} of January 8, we +apply the given function or formula to data (for each group present) with +\code{time_value}s from January 1 onward, as they were reported on January 8. +For typical disease surveillance sources, this will not include any data +with a \code{time_value} of January 8, and, depending on the amount of reporting +latency, may not include January 7 or even earlier \code{time_value}s. (If +instead the archive were to hold nowcasts instead of regular surveillance +data, then we would indeed expect data for \code{time_value} January 8. If it +were to hold forecasts, then we would expect data for \code{time_value}s after +January 8, and the sliding window would extend as far after each +\code{ref_time_value} as needed to include all such \code{time_value}s.)} + +\item{ref_time_values}{Reference time values / versions for sliding +computations; each element of this vector serves both as the anchor point +for the \code{time_value} window for the computation and the \code{max_version} +\code{as_of} which we fetch data in this window. If missing, then this will set +to a regularly-spaced sequence of values set to cover the range of +\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will +be guessed (using the GCD of the skips between values).} + +\item{time_step}{Optional function used to define the meaning of one time +step, which if specified, overrides the default choice based on the +\code{time_value} column. This function must take a positive integer and return +an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this +would only be meaningful if \code{time_value} is of class \code{POSIXct}).} + +\item{new_col_name}{String indicating the name of the new column that will +contain the derivative values. Default is "slide_value"; note that setting +\code{new_col_name} equal to an existing column name will overwrite this column.} + +\item{as_list_col}{Should the slide results be held in a list column, or be +\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, +in which case a list object returned by \code{f} would be unnested (using +\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, +the names of the resulting columns are given by prepending \code{new_col_name} +to the names of the list elements.} + +\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} +when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix +from \code{new_col_name} entirely.} + +\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If +\code{all_versions = TRUE}, then \code{f} will be passed the version history (all +\code{version <= ref_time_value}) for rows having \code{time_value} between +\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be +passed only the most recent \code{version} for every unique \code{time_value}. +Default is \code{FALSE}.} +} +\description{ +Slides a given function over variables in an \code{epi_archive} +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for +details. The parameter descriptions below are copied from there +} diff --git a/man/slide.grouped_epi_archive2.Rd b/man/slide.grouped_epi_archive2.Rd new file mode 100644 index 00000000..b5aac24c --- /dev/null +++ b/man/slide.grouped_epi_archive2.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{slide.grouped_epi_archive2} +\alias{slide.grouped_epi_archive2} +\title{Slide over grouped epi archive} +\usage{ +\method{slide}{grouped_epi_archive2}( + grouped_epi_archive, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) +} +\description{ +Slides a given function over variables in a \code{grouped_epi_archive} +object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for +details. +} diff --git a/man/truncate_versions_after.epi_archive2.Rd b/man/truncate_versions_after.epi_archive2.Rd new file mode 100644 index 00000000..08ae40d4 --- /dev/null +++ b/man/truncate_versions_after.epi_archive2.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive_new.R +\name{truncate_versions_after.epi_archive2} +\alias{truncate_versions_after.epi_archive2} +\title{Truncate versions after} +\usage{ +\method{truncate_versions_after}{epi_archive2}(epi_archive, max_version) +} +\arguments{ +\item{epi_archive}{as in \code{\link{epix_truncate_versions_after}}} + +\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} +} +\description{ +Filter to keep only older versions, mutating the archive by +potentially reseating but not mutating some fields. \code{DT} is likely, but not +guaranteed, to be copied. Returns the mutated archive +\link[base:invisible]{invisibly}. +} diff --git a/man/truncate_versions_after.grouped_epi_archive2.Rd b/man/truncate_versions_after.grouped_epi_archive2.Rd new file mode 100644 index 00000000..7c25950f --- /dev/null +++ b/man/truncate_versions_after.grouped_epi_archive2.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/grouped_archive_new.R +\name{truncate_versions_after.grouped_epi_archive2} +\alias{truncate_versions_after.grouped_epi_archive2} +\title{Truncate versions after a given version, grouped} +\usage{ +\method{truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) +} +\arguments{ +\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} + +\item{x}{as in \code{\link{epix_truncate_versions_after}}} +} +\description{ +Filter to keep only older versions by mutating the underlying +\code{epi_archive} using \verb{$truncate_versions_after}. Returns the mutated +\code{grouped_epi_archive} \link[base:invisible]{invisibly}. +} diff --git a/tests/testthat/test-archive_new.R b/tests/testthat/test-archive_new.R new file mode 100644 index 00000000..98f708d7 --- /dev/null +++ b/tests/testthat/test-archive_new.R @@ -0,0 +1,173 @@ +library(dplyr) + +test_that("first input must be a data.frame", { + expect_error(as_epi_archive2(c(1, 2, 3), compactify = FALSE), + regexp = "Must be of type 'data.frame'." + ) +}) + +dt <- archive_cases_dv_subset_2$DT + +test_that("data.frame must contain geo_value, time_value and version columns", { + expect_error(as_epi_archive2(select(dt, -geo_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -time_value), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + expect_error(as_epi_archive2(select(dt, -version), compactify = FALSE), + regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) +}) + +test_that("other_keys can only contain names of the data.frame columns", { + expect_error(as_epi_archive2(dt, other_keys = "xyz", compactify = FALSE), + regexp = "`other_keys` must be contained in the column names of `x`." + ) + expect_error(as_epi_archive2(dt, other_keys = "percent_cli", compactify = FALSE), NA) +}) + +test_that("other_keys cannot contain names geo_value, time_value or version", { + expect_error(as_epi_archive2(dt, other_keys = "geo_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "time_value", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) + expect_error(as_epi_archive2(dt, other_keys = "version", compactify = FALSE), + regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." + ) +}) + +test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { + expect_warning(as_epi_archive2(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) + expect_warning(as_epi_archive2(dt, additional_metadata = list(time_type = 1), compactify = FALSE), + regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." + ) +}) + +test_that("epi_archives are correctly instantiated with a variety of data types", { + # Data frame + df <- data.frame( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20 + ) + + ea1 <- as_epi_archive2(df, compactify = FALSE) + expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) + expect_equal(ea1$additional_metadata, list()) + + ea2 <- as_epi_archive2(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea2$additional_metadata, list(value = df$value)) + + # Tibble + tib <- tibble::tibble(df, code = "x") + + ea3 <- as_epi_archive2(tib, compactify = FALSE) + expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) + expect_equal(ea3$additional_metadata, list()) + + ea4 <- as_epi_archive2(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea4$additional_metadata, list(value = df$value)) + + # Keyed data.table + kdt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA", + key = "code" + ) + + ea5 <- as_epi_archive2(kdt, compactify = FALSE) + # Key from data.table isn't absorbed when as_epi_archive2 is used + expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) + expect_equal(ea5$additional_metadata, list()) + + ea6 <- as_epi_archive2(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) + # Mismatched keys, but the one from as_epi_archive2 overrides + expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) + expect_equal(ea6$additional_metadata, list(value = df$value)) + + # Unkeyed data.table + udt <- data.table::data.table( + geo_value = "ca", + time_value = as.Date("2020-01-01"), + version = as.Date("2020-01-01") + 0:19, + value = 1:20, + code = "CA" + ) + + ea7 <- as_epi_archive2(udt, compactify = FALSE) + expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) + expect_equal(ea7$additional_metadata, list()) + + ea8 <- as_epi_archive2(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea8$additional_metadata, list(value = df$value)) + + # epi_df + edf1 <- jhu_csse_daily_subset %>% + select(geo_value, time_value, cases) %>% + mutate(version = max(time_value), code = "USA") + + ea9 <- as_epi_archive2(edf1, compactify = FALSE) + expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) + expect_equal(ea9$additional_metadata, list()) + + ea10 <- as_epi_archive2(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) + expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) + expect_equal(ea10$additional_metadata, list(value = df$value)) + + # Keyed epi_df + edf2 <- data.frame( + geo_value = "al", + time_value = rep(as.Date("2020-01-01") + 0:9, 2), + version = c( + rep(as.Date("2020-01-25"), 10), + rep(as.Date("2020-01-26"), 10) + ), + cases = 1:20, + misc = "USA" + ) %>% + as_epi_df(additional_metadata = list(other_keys = "misc")) + + ea11 <- as_epi_archive2(edf2, compactify = FALSE) + expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) + expect_equal(ea11$additional_metadata, list()) + + ea12 <- as_epi_archive2(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) + expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) + expect_equal(ea12$additional_metadata, list(value = df$misc)) +}) + +test_that("`epi_archive` rejects nonunique keys", { + toy_update_tbl <- + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130, + "us", "pediatric", "2000-01-01", "2000-01-02", 5 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) + expect_error( + as_epi_archive2(toy_update_tbl), + class = "epiprocess__epi_archive_requires_unique_key" + ) + expect_error( + regexp = NA, + as_epi_archive2(toy_update_tbl, other_keys = "age_group"), + ) +}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 4400c94a..58e97884 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,7 +2,7 @@ library(epiprocess) library(data.table) library(dplyr) -dt <- archive_cases_dv_subset$DT +dt <- archive_cases_dv_subset_2$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% select(-case_rate_7d_av) diff --git a/tests/testthat/test-compactify_new.R b/tests/testthat/test-compactify_new.R new file mode 100644 index 00000000..cd53913d --- /dev/null +++ b/tests/testthat/test-compactify_new.R @@ -0,0 +1,110 @@ +library(epiprocess) +library(data.table) +library(dplyr) + +dt <- archive_cases_dv_subset_2$DT +dt <- filter(dt, geo_value == "ca") %>% + filter(version <= "2020-06-15") %>% + select(-case_rate_7d_av) + +test_that("Input for compactify must be NULL or a boolean", { + expect_error(as_epi_archive2(dt, compactify = "no")) +}) + +dt$percent_cli <- c(1:80) +dt$case_rate <- c(1:80) + +row_replace <- function(dt, row, x, y) { + # (This way of "replacing" elements appears to use copy-on-write even though + # we are working with a data.table.) + dt[row, 4] <- x + dt[row, 5] <- y + dt +} + +# Note that compactify is working on version-wise LOCF (last version of each +# observation carried forward) + +# Rows 1 should not be eliminated even if NA +dt <- row_replace(dt, 1, NA, NA) # Not LOCF + +# NOTE! We are assuming that there are no NA's in geo_value, time_value, +# and version. Even though compactify may erroneously remove the first row +# if it has all NA's, we are not testing this behaviour for now as this dataset +# has problems beyond the scope of this test + +# Rows 11 and 12 correspond to different time_values +dt <- row_replace(dt, 12, 11, 11) # Not LOCF + +# Rows 20 and 21 only differ in version +dt <- row_replace(dt, 21, 20, 20) # LOCF + +# Rows 21 and 22 only differ in version +dt <- row_replace(dt, 22, 20, 20) # LOCF + +# Row 39 comprises the first NA's +dt <- row_replace(dt, 39, NA, NA) # Not LOCF + +# Row 40 has two NA's, just like its lag, row 39 +dt <- row_replace(dt, 40, NA, NA) # LOCF + +# Row 62's values already exist in row 15, but row 15 is not a preceding row +dt <- row_replace(dt, 62, 15, 15) # Not LOCF + +# Row 73 only has one value carried over +dt <- row_replace(dt, 74, 73, 74) # Not LOCF + +dt_true <- as_tibble(as_epi_archive2(dt, compactify = TRUE)$DT) +dt_false <- as_tibble(as_epi_archive2(dt, compactify = FALSE)$DT) +dt_null <- suppressWarnings(as_tibble(as_epi_archive2(dt, compactify = NULL)$DT)) + +test_that("Warning for LOCF with compactify as NULL", { + expect_warning(as_epi_archive2(dt, compactify = NULL)) +}) + +test_that("No warning when there is no LOCF", { + expect_warning(as_epi_archive2(dt[1:5], compactify = NULL), NA) +}) + +test_that("LOCF values are ignored with compactify=FALSE", { + expect_identical(nrow(dt), nrow(dt_false)) +}) + +test_that("LOCF values are taken out with compactify=TRUE", { + dt_test <- as_tibble(as_epi_archive2(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) + + expect_identical(dt_true, dt_null) + expect_identical(dt_null, dt_test) +}) + +test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { + ea_true <- as_epi_archive2(dt, compactify = TRUE) + ea_false <- as_epi_archive2(dt, compactify = FALSE) + + # Row 22, an LOCF row corresponding to the latest version, is omitted in + # ea_true + latest_version <- max(ea_false$DT$version) + as_of_true <- as_of(ea_true, latest_version) + as_of_false <- as_of(ea_false, latest_version) + + expect_identical(as_of_true, as_of_false) +}) + +test_that("compactify does not alter the default clobberable and observed version bounds", { + x <- tibble::tibble( + geo_value = "geo1", + time_value = as.Date("2000-01-01"), + version = as.Date("2000-01-01") + 1:5, + value = 42L + ) + ea_true <- as_epi_archive2(x, compactify = TRUE) + ea_false <- as_epi_archive2(x, compactify = FALSE) + # We say that we base the bounds on the user's `x` arg. We might mess up or + # change our minds and base things on the `DT` field (or a temporary `DT` + # variable, post-compactify) instead. Check that this test would trigger + # in that case: + expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) + # The actual test: + expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) + expect_identical(ea_true$versions_end, ea_false$versions_end) +}) diff --git a/tests/testthat/test-epix_fill_through_version_new.R b/tests/testthat/test-epix_fill_through_version_new.R new file mode 100644 index 00000000..2b76a851 --- /dev/null +++ b/tests/testthat/test-epix_fill_through_version_new.R @@ -0,0 +1,109 @@ +test_that("epix_fill_through_version2 mirrors input when it is sufficiently up to date", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + some_earlier_observed_version <- 2L + ea_trivial_fill_na1 <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "na") + ea_trivial_fill_na2 <- epix_fill_through_version2(ea_orig, ea_orig$versions_end, "na") + ea_trivial_fill_locf <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "locf") + # Below, we want R6 objects to be compared based on contents rather than + # addresses. We appear to get this with `expect_identical` in `testthat` + # edition 3, which is based on `waldo::compare` rather than `base::identical`; + # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 + # objects by contents rather than address (in a way that is tested but maybe + # not guaranteed via user docs). Use `testthat::local_edition` to ensure we + # use testthat edition 3 here (use `testthat::` to prevent ambiguity with + # `readr`). + testthat::local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) + expect_identical(ea_orig, ea_trivial_fill_na2) + expect_identical(ea_orig, ea_trivial_fill_locf) +}) + +test_that("epix_fill_through_version2 can extend observed versions, gives expected `as_of`s", { + ea_orig <- as_epi_archive2(data.table::data.table( + geo_value = "g1", + time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), + version = c(1:5, 2L), + value = 1:6 + )) + first_unobserved_version <- 6L + later_unobserved_version <- 10L + ea_fill_na <- epix_fill_through_version2(ea_orig, later_unobserved_version, "na") + ea_fill_locf <- epix_fill_through_version2(ea_orig, later_unobserved_version, "locf") + + # We use testthat edition 3 features here, passing `ignore_attr` to + # `waldo::compare`. Ensure we are using edition 3: + testthat::local_edition(3) + withCallingHandlers( + { + expect_identical(ea_fill_na$versions_end, later_unobserved_version) + expect_identical(tibble::as_tibble(as_of(ea_fill_na, first_unobserved_version)), + tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), + ignore_attr = TRUE + ) + expect_identical(ea_fill_locf$versions_end, later_unobserved_version) + expect_identical( + as_of(ea_fill_locf, first_unobserved_version), + as_of(ea_fill_locf, ea_orig$versions_end) %>% + { + attr(., "metadata")$as_of <- first_unobserved_version + . + } + ) + }, + epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") + ) +}) + +test_that("epix_fill_through_version2 does not mutate x", { + for (ea_orig in list( + # vanilla case + as_epi_archive2(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )), + # data.table unique yielding original DT by reference special case (maybe + # having only 1 row is the trigger? having no revisions of initial values + # doesn't seem sufficient to trigger) + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + )) { + # We want to perform a strict comparison of the contents of `ea_orig` before + # and `ea_orig` after. `clone` + `expect_identical` based on waldo would + # sort of work, but we might want something stricter. `as.list` + + # `identical` plus a check of the DT seems to do the trick. + ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) + some_unobserved_version <- 8L + # + ea_fill_na <- epix_fill_through_version2(ea_orig, some_unobserved_version, "na") + ea_orig_after_as_list <- as.list(ea_orig) + # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + # + ea_fill_locf <- epix_fill_through_version2(ea_orig, some_unobserved_version, "locf") + ea_orig_after_as_list <- as.list(ea_orig) + expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_DT_before_copy, ea_orig$DT) + } +}) + +test_that("epix_fill_through_version return with expected visibility", { + ea <- as_epi_archive(data.table::data.table( + geo_value = "g1", time_value = as.Date("2020-01-01"), + version = 1:5, value = 1:5 + )) + expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) +}) + +test_that("epix_fill_through_version2 returns same key & doesn't mutate old DT or its key", { + ea <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) + old_DT <- ea$DT + old_DT_copy <- data.table::copy(old_DT) + old_key <- data.table::key(ea$DT) + expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "na")$DT), old_key) + expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "locf")$DT), old_key) + expect_identical(data.table::key(ea$DT), old_key) +}) diff --git a/tests/testthat/test-epix_merge_new.R b/tests/testthat/test-epix_merge_new.R new file mode 100644 index 00000000..10041dbb --- /dev/null +++ b/tests/testthat/test-epix_merge_new.R @@ -0,0 +1,226 @@ +test_that("epix_merge requires forbids on invalid `y`", { + ea <- archive_cases_dv_subset_2 %>% + clone() + expect_error(epix_merge2(ea, data.frame(x = 1))) +}) + +test_that("epix_merge merges and carries forward updates properly", { + x <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, + # same version set for x and y + "g1", 1L, 1:3, paste0("XA", 1:3), + # versions of x surround those of y + this measurement has + # max update version beyond some others + "g1", 2L, 1:5, paste0("XB", 1:5), + # mirror case + "g1", 3L, 2L, paste0("XC", 2L), + # x has 1 version, y has 0 + "g1", 4L, 1L, paste0("XD", 1L), + # non-NA values that should be carried forward + # (version-wise LOCF) in other versions, plus NAs that + # should (similarly) be carried forward as NA (latter + # wouldn't work with an ordinary merge + post-processing + # with `data.table::nafill`) + "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) + ) %>% + tidyr::unchop(c(version, x_value)) %>% + dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + y <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~y_value, + "g1", 1L, 1:3, paste0("YA", 1:3), + "g1", 2L, 2L, paste0("YB", 2L), + "g1", 3L, 1:5, paste0("YC", 1:5), + "g1", 5L, 1L, paste0("YD", 1L), + "g1", 6L, 1:5, paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, y_value)) %>% + dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + xy <- epix_merge2(x, y) + xy_expected <- as_epi_archive2( + data.table::as.data.table( + tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), + "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), + "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), + "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), + "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), + "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), + ) %>% + tidyr::unchop(c(version, x_value, y_value)) %>% + dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) + ) + ) + # We rely on testthat edition 3 expect_identical using waldo, not identical. See + # test-epix_fill_through_version.R comments for details. + testthat::local_edition(3) + expect_identical(xy, xy_expected) +}) + +test_that("epix_merge forbids and warns on metadata and naming issues", { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*geo_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) + ), + regexp = "must have the same.*time_type" + ) + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) + ), + regexp = "overlapping.*names" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) + ), + regexp = "x\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) + expect_warning( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) + ) + ), + regexp = "y\\$additional_metadata", + class = "epiprocess__epix_merge_ignores_additional_metadata" + ) +}) + +# use `local` to prevent accidentally using the x, y, xy bindings here +# elsewhere, while allowing reuse across a couple tests +local({ + x <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), + clobberable_versions_start = 1L, versions_end = 10L + ) + y <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), + clobberable_versions_start = 3L, versions_end = 10L + ) + xy <- epix_merge2(x, y) + test_that("epix_merge considers partially-clobberable row to be clobberable", { + expect_identical(xy$clobberable_versions_start, 1L) + }) + test_that("epix_merge result uses versions_end metadata not max version val", { + expect_identical(xy$versions_end, 10L) + }) +}) + +local({ + x <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), + clobberable_versions_start = 1L, + versions_end = 3L + ) + y <- as_epi_archive2( + tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), + clobberable_versions_start = 1L + ) + test_that('epix_merge forbids on sync default or "forbid"', { + expect_error(epix_merge2(x, y), + class = "epiprocess__epix_merge_unresolved_sync" + ) + expect_error(epix_merge2(x, y, sync = "forbid"), + class = "epiprocess__epix_merge_unresolved_sync" + ) + }) + test_that('epix_merge sync="na" works', { + expect_equal( + epix_merge2(x, y, sync = "na"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet + 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated + # (we should not have a y vals -> NA update here; version 5 should be + # the `versions_end` of the result) + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="locf" works', { + expect_equal( + epix_merge2(x, y, sync = "locf"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated + ), clobberable_versions_start = 1L) + ) + }) + test_that('epix_merge sync="truncate" works', { + expect_equal( + epix_merge2(x, y, sync = "truncate"), + as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet + # y's update beyond x's last update has been truncated + ), clobberable_versions_start = 1L, versions_end = 3L) + ) + }) + x_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) + y_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) + xy_no_conflict_expected <- as_epi_archive2(tibble::tribble( + ~geo_value, ~time_value, ~version, ~x_value, ~y_value, + 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet + )) + test_that('epix_merge sync="forbid" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "forbid"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="na" on no-conflict works', { + # This test is the main reason for these no-conflict tests. We want to make + # sure that we don't add an unnecessary NA-ing-out version beyond a common + # versions_end. + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "na"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="locf" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "locf"), + xy_no_conflict_expected + ) + }) + test_that('epix_merge sync="truncate" on no-conflict works', { + expect_equal( + epix_merge2(x_no_conflict, y_no_conflict, sync = "truncate"), + xy_no_conflict_expected + ) + }) +}) + + +test_that('epix_merge sync="na" balks if do not know next_after', { + expect_error( + epix_merge2( + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), + as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), + sync = "na" + ), + regexp = "no applicable method.*next_after" + ) +}) diff --git a/tests/testthat/test-epix_slide_new.R b/tests/testthat/test-epix_slide_new.R new file mode 100644 index 00000000..49ef5e41 --- /dev/null +++ b/tests/testthat/test-epix_slide_new.R @@ -0,0 +1,810 @@ +library(dplyr) + +test_that("epix_slide2 only works on an epi_archive", { + expect_error(epix_slide2(data.frame(x = 1))) +}) + +x <- tibble::tribble( + ~version, ~time_value, ~binary, + 4, c(1:3), 2^(1:3), + 5, c(1:2, 4), 2^(4:6), + 6, c(1:2, 4:5), 2^(7:10), + 7, 2:6, 2^(11:15) +) %>% + tidyr::unnest(c(time_value, binary)) + +xx <- bind_cols(geo_value = rep("x", 15), x) %>% + as_epi_archive2() + +test_that("epix_slide2 works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by( + dplyr::across(dplyr::all_of("geo_value")) + ) %>% + slide( + f = ~ sum(.x$binary), + before = 2, + new_col_name = "sum_binary" + ) + + expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical + + # function interface + xx4 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2(f = function(x, gk, rtv) { + tibble::tibble(sum_binary = sum(x$binary)) + }, before = 2, names_sep = NULL) + + expect_identical(xx1, xx4) + + # tidyeval interface + xx5 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + sum_binary = sum(binary), + before = 2 + ) + + expect_identical(xx1, xx5) +}) + +test_that("epix_slide2 works as intended with `as_list_col=TRUE`", { + xx_dfrow1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + xx_dfrow2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) %>% + purrr::map(~ data.frame(bin_sum = .x)) + ) %>% + group_by(geo_value) + + expect_identical(xx_dfrow1, xx_dfrow2) # * + + xx_dfrow3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ data.frame(bin_sum = sum(.x$binary)), + before = 2, + as_list_col = TRUE + ) + + expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical + + xx_df1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ data.frame(bin = .x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_df2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(~ data.frame(bin = rev(.x))) + ) %>% + group_by(geo_value) + + expect_identical(xx_df1, xx_df2) + + xx_scalar1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$binary), + before = 2, + as_list_col = TRUE + ) + + xx_scalar2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9, + 2^15 + 2^14 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx_scalar1, xx_scalar2) + + xx_vec1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ .x$binary, + before = 2, + as_list_col = TRUE + ) + + xx_vec2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = + list( + c(2^3, 2^2), + c(2^6, 2^3), + c(2^10, 2^9), + c(2^15, 2^14) + ) %>% + purrr::map(rev) + ) %>% + group_by(geo_value) + + expect_identical(xx_vec1, xx_vec2) +}) + +test_that("epix_slide2 `before` validation works", { + expect_error( + slide(xx, f = ~ sum(.x$binary)), + "`before` is required" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = NA), + "Assertion on 'before' failed: May not be NA" + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = -1), + "Assertion on 'before' failed: Element 1 is not >= 0" + ) + expect_error(slide(xx, f = ~ sum(.x$binary), before = 1.5), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # We might want to allow this at some point (issue #219): + expect_error(slide(xx, f = ~ sum(.x$binary), before = Inf), + regexp = "before", + class = "vctrs_error_incompatible_type" + ) + # (wrapper shouldn't introduce a value:) + expect_error(epix_slide2(xx, f = ~ sum(.x$binary)), "`before` is required") + # These `before` values should be accepted: + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 0), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 2L), + NA + ) + expect_error( + slide(xx, f = ~ sum(.x$binary), before = 365000), + NA + ) +}) + +test_that("quosure passing issue in epix_slide2 is resolved + other potential issues", { + # (First part adapted from @examples) + time_values <- seq(as.Date("2020-06-01"), + as.Date("2020-06-02"), + by = "1 day" + ) + # We only have one non-version, non-time key in the example archive. Add + # another so that we don't accidentally pass tests due to accidentally + # matching the default grouping. + ea <- as_epi_archive2( + archive_cases_dv_subset$DT %>% + dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), + other_keys = "modulus", + compactify = TRUE + ) + reference_by_modulus <- ea %>% + group_by(modulus) %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + reference_by_neither <- ea %>% + group_by() %>% + epix_slide2( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ) + # test the passing-something-that-must-be-enquosed behavior: + # + # (S3 group_by behavior for this case is the `reference_by_modulus`) + expect_identical( + ea %>% group_by(modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the .data pronoun behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(.data$modulus), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(.data$modulus) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing across-all-of-string-literal behavior: + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(all_of("modulus"))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(across(all_of("modulus"))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the passing-across-all-of-string-var behavior: + my_group_by <- "modulus" + expect_identical( + epix_slide2( + x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + expect_identical( + ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_modulus + ) + # test the default behavior (default in this case should just be grouping by neither): + expect_identical( + epix_slide2( + x = ea, + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) + expect_identical( + ea %>% slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), + reference_by_neither + ) +}) + +ea <- tibble::tribble( + ~version, ~time_value, ~binary, + 2, 1:1, 2^(1:1), + 3, 1:2, 2^(2:1), + 4, 1:3, 2^(3:1), + 5, 1:4, 2^(4:1), + 6, 1:5, 2^(5:1), + 7, 1:6, 2^(6:1) +) %>% + tidyr::unnest(c(time_value, binary)) %>% + mutate(geo_value = "x") %>% + as_epi_archive2() + +test_that("epix_slide2 with all_versions option has access to all older versions", { + library(data.table) + # Make sure we're using testthat edition 3, where `expect_identical` doesn't + # actually mean `base::identical` but something more content-based using + # `waldo` package: + testthat::local_edition(3) + + slide_fn <- function(x, gk, rtv) { + return(tibble( + n_versions = length(unique(x$DT$version)), + n_row = nrow(x$DT), + dt_class1 = class(x$DT)[[1L]], + dt_key = list(key(x$DT)) + )) + } + + ea_orig_mirror <- ea %>% clone(deep = TRUE) + ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) + + result1 <- ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_true(inherits(result1, "tbl_df")) + + result2 <- tibble::tribble( + ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, + 2, 1L, sum(1:1), "data.table", key(ea$DT), + 3, 2L, sum(1:2), "data.table", key(ea$DT), + 4, 3L, sum(1:3), "data.table", key(ea$DT), + 5, 4L, sum(1:4), "data.table", key(ea$DT), + 6, 5L, sum(1:5), "data.table", key(ea$DT), + 7, 6L, sum(1:6), "data.table", key(ea$DT), + ) + + expect_identical(result1, result2) # * + + result3 <- ea %>% + group_by() %>% + slide( + f = slide_fn, + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result3) # This and * Imply result2 and result3 are identical + + # formula interface + result4 <- ea %>% + group_by() %>% + epix_slide2( + f = ~ slide_fn(.x, .y), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result4) # This and * Imply result2 and result4 are identical + + # tidyeval interface + result5 <- ea %>% + group_by() %>% + epix_slide2( + data = slide_fn( + .x, + stop("slide_fn doesn't use group key, no need to prepare it") + ), + before = 10^3, + names_sep = NULL, + all_versions = TRUE + ) + + expect_identical(result1, result5) # This and * Imply result2 and result5 are identical + expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea +}) + +test_that("as_of and epix_slide2 with long enough window are compatible", { + library(data.table) + testthat::local_edition(3) + + # For all_versions = FALSE: + + f1 <- function(x, gk, rtv) { + tibble( + diff_mean = mean(diff(x$binary)) + ) + } + ref_time_value1 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ) + + # For all_versions = TRUE: + + f2 <- function(x, gk, rtv) { + x %>% + # extract time&version-lag-1 data: + epix_slide2( + function(subx, subgk, rtv) { + tibble(data = list( + subx %>% + filter(time_value == attr(subx, "metadata")$as_of - 1) %>% + rename(real_time_value = time_value, lag1 = binary) + )) + }, + before = 1, names_sep = NULL + ) %>% + # assess as nowcast: + unnest(data) %>% + inner_join(x %>% as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% + summarize(mean_abs_delta = mean(abs(binary - lag1))) + } + ref_time_value2 <- 5 + + expect_identical( + ea %>% as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), + ea %>% slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ) + + # Test the same sort of thing when grouping by geo in an archive with multiple geos. + ea_multigeo <- ea %>% clone() + ea_multigeo$DT <- rbind( + ea_multigeo$DT, + copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] + ) + setkeyv(ea_multigeo$DT, key(ea$DT)) + + expect_identical( + ea_multigeo %>% + group_by(geo_value) %>% + epix_slide2(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% + filter(geo_value == "x"), + ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` + epix_as_of2(ref_time_value2, all_versions = TRUE) %>% + f2() %>% + transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% + group_by(geo_value) + ) +}) + +test_that("epix_slide2 `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { + slide_fn <- function(x, gk, rtv) { + expect_true(is_epi_archive2(x)) + return(NA) + } + + ea %>% + group_by() %>% + epix_slide2( + f = slide_fn, + before = 1, + ref_time_values = 5, + new_col_name = "out", + all_versions = TRUE + ) +}) + +test_that("epix_slide2 with all_versions option works as intended", { + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + xx2 <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + sum_binary = c( + 2^3 + 2^2, + 2^6 + 2^3, + 2^10 + 2^9 + 2^6, + 2^15 + 2^14 + 2^10 + ) + ) %>% + group_by(geo_value) + + expect_identical(xx1, xx2) # * + + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + slide( + f = ~ sum(.x$DT$binary), + before = 2, + new_col_name = "sum_binary", + all_versions = TRUE + ) + + expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical +}) + +# XXX currently, we're using a stopgap measure of having `epix_slide2` always +# output a (grouped/ungrouped) tibble while we think about the class, columns, +# and attributes of `epix_slide2` output more carefully. We might bring this test +# back depending on the decisions there: +# +# test_that("`epix_slide2` uses `versions_end` as a resulting `epi_df`'s `as_of`", { +# ea_updated_stale = ea$clone() +# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) +# # +# expect_identical( +# ea_updated_stale %>% +# group_by(geo_value) %>% +# epix_slide2(~ slice_head(.x, n = 1L), before = 10L) %>% +# ungroup() %>% +# attr("metadata") %>% +# .$as_of, +# 10 +# ) +# }) + +test_that("epix_slide2 works with 0-row computation outputs", { + epix_slide_empty <- function(ea, ...) { + ea %>% + epix_slide2(before = 5L, ..., function(x, gk, rtv) { + tibble::tibble() + }) + } + expect_identical( + ea %>% + epix_slide_empty(), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, + # as_of = ea$versions_end) %>% + group_by(geo_value) + ) + # with `all_versions=TRUE`, we have something similar but never get an + # `epi_df`: + expect_identical( + ea %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + time_value = ea$DT$version[integer(0)] + ) + ) + expect_identical( + ea %>% + group_by(geo_value) %>% + epix_slide_empty(all_versions = TRUE), + tibble::tibble( + geo_value = ea$DT$geo_value[integer(0)], + time_value = ea$DT$version[integer(0)] + ) %>% + group_by(geo_value) + ) +}) + +# test_that("epix_slide grouped by geo can produce `epi_df` output", { +# # This is a characterization test. Not sure we actually want this behavior; +# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 +# expect_identical( +# ea %>% +# group_by(geo_value) %>% +# epix_slide(before = 5L, function(x,g) { +# tibble::tibble(value = 42) +# }, names_sep = NULL), +# tibble::tibble( +# geo_value = "x", +# time_value = epix_slide_ref_time_values_default(ea), +# value = 42 +# ) %>% +# new_epi_df(as_of = ea$versions_end) +# ) +# }) + +test_that("epix_slide alerts if the provided f doesn't take enough args", { + f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + # If `regexp` is NA, asserts that there should be no errors/messages. + expect_error(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + expect_warning(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) + + f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) + expect_warning(epix_slide2(xx, f_x_dots, before = 2L), + class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" + ) +}) + +test_that("epix_slide2 computation via formula can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.ref_time_value, + before = 2 + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~.z, + before = 2 + ) + + expect_identical(xx2, xx_ref) + + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = ~..3, + before = 2 + ) + + expect_identical(xx3, xx_ref) +}) + +test_that("epix_slide2 computation via function can use ref_time_value", { + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + f = function(x, g, t) t, + before = 2 + ) + + expect_identical(xx1, xx_ref) +}) + +test_that("epix_slide2 computation via dots can use ref_time_value and group", { + # ref_time_value + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = c(4, 5, 6, 7) + ) %>% + group_by(geo_value) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .ref_time_value + ) + + expect_identical(xx1, xx_ref) + + # group_key + xx_ref <- tibble( + geo_value = rep("x", 4), + time_value = c(4, 5, 6, 7), + slide_value = "x" + ) %>% + group_by(geo_value) + + # Use group_key column + xx3 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = .group_key$geo_value + ) + + expect_identical(xx3, xx_ref) + + # Use entire group_key object + expect_error( + xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + slide_value = nrow(.group_key) + ), + NA + ) +}) + +test_that("epix_slide2 computation via dots outputs the same result using col names and the data var", { + xx_ref <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(time_value) + ) + + xx1 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.x$time_value) + ) + + expect_identical(xx1, xx_ref) + + xx2 <- xx %>% + group_by(.data$geo_value) %>% + epix_slide2( + before = 2, + sum_binary = sum(.data$time_value) + ) + + expect_identical(xx2, xx_ref) +}) + +test_that("`epix_slide2` doesn't decay date output", { + expect_true( + xx$DT %>% + as_tibble() %>% + mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% + as_epi_archive2() %>% + epix_slide2(before = 5L, ~ attr(.x, "metadata")$as_of) %>% + `[[`("slide_value") %>% + inherits("Date") + ) +}) + +test_that("`epix_slide2` can access objects inside of helper functions", { + helper <- function(archive_haystack, time_value_needle) { + archive_haystack %>% epix_slide2(has_needle = time_value_needle %in% time_value, before = 365000L) + } + expect_error( + helper(archive_cases_dv_subset_2, as.Date("2021-01-01")), + NA + ) + expect_error( + helper(xx, 3L), + NA + ) +}) diff --git a/tests/testthat/test-grouped_epi_archive_new.R b/tests/testthat/test-grouped_epi_archive_new.R new file mode 100644 index 00000000..8f0133b9 --- /dev/null +++ b/tests/testthat/test-grouped_epi_archive_new.R @@ -0,0 +1,104 @@ +test_that("Grouping, regrouping, and ungrouping archives works as intended", { + # From an example: + library(dplyr) + toy_archive <- + tribble( + ~geo_value, ~age_group, ~time_value, ~version, ~value, + "us", "adult", "2000-01-01", "2000-01-02", 121, + "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) + "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) + "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value), + version = as.Date(version) + ) %>% + as_epi_archive2(other_keys = "age_group") + + # Ensure that we're using testthat edition 3's idea of "identical", which is + # not as strict as `identical`: + testthat::local_edition(3) + + # Test equivalency claims in example: + by_both_keys <- toy_archive %>% group_by(geo_value, age_group) + expect_identical( + by_both_keys, + toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) + ) + grouping_cols <- c("geo_value", "age_group") + expect_identical( + by_both_keys, + toy_archive %>% group_by(across(all_of(grouping_cols))) + ) + + expect_identical( + toy_archive %>% group_by(geo_value), + toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) + ) + + # Test `.drop` behavior: + expect_error(toy_archive %>% group_by(.drop = "bogus"), + regexp = "Must be of type 'logical', not 'character'" + ) + expect_warning(toy_archive %>% group_by(.drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + expect_warning( + grouped_factor_then_nonfactor <- + toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + expect_identical( + grouped_factor_then_nonfactor %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~age_group, ~geo_value, ~time_value, ~s, + "pediatric", NA_character_, "2000-01-02", 0, + "adult", "us", "2000-01-02", 121, + "pediatric", "us", "2000-01-03", 5, + "adult", "us", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # # See + # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 + # # and + # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 + # # for why this is commented out, pending some design + # # decisions. + # # + # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 + # as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(age_group, geo_value, time_value, s) %>% + group_by(age_group, geo_value, .drop = FALSE) + ) + expect_identical( + toy_archive %>% + group_by(geo_value, age_group, .drop = FALSE) %>% + epix_slide2(before = 10, s = sum(value)), + tibble::tribble( + ~geo_value, ~age_group, ~time_value, ~s, + "us", "pediatric", "2000-01-02", 0, + "us", "adult", "2000-01-02", 121, + "us", "pediatric", "2000-01-03", 5, + "us", "adult", "2000-01-03", 255 + ) %>% + mutate( + age_group = ordered(age_group, c("pediatric", "adult")), + time_value = as.Date(time_value) + ) %>% + # as_epi_df(as_of = as.Date("2000-01-03"), + # additional_metadata = list(other_keys = "age_group")) %>% + # # put back in expected order; see issue #166: + # select(geo_value, age_group, time_value, s) %>% + group_by(geo_value, age_group, .drop = FALSE) + ) +}) diff --git a/tests/testthat/test-methods-epi_archive_new.R b/tests/testthat/test-methods-epi_archive_new.R new file mode 100644 index 00000000..eb2c14be --- /dev/null +++ b/tests/testthat/test-methods-epi_archive_new.R @@ -0,0 +1,136 @@ +library(dplyr) + +ea <- archive_cases_dv_subset_2 %>% + clone() + +ea2_data <- tibble::tribble( + ~geo_value, ~time_value, ~version, ~cases, + "ca", "2020-06-01", "2020-06-01", 1, + "ca", "2020-06-01", "2020-06-02", 2, + # + "ca", "2020-06-02", "2020-06-02", 0, + "ca", "2020-06-02", "2020-06-03", 1, + "ca", "2020-06-02", "2020-06-04", 2, + # + "ca", "2020-06-03", "2020-06-03", 1, + # + "ca", "2020-06-04", "2020-06-04", 4, +) %>% + dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) + +# epix_as_of tests +test_that("epix_as_of behaves identically to as_of method", { + expect_identical( + epix_as_of2(ea, max_version = min(ea$DT$version)), + ea %>% as_of(max_version = min(ea$DT$version)) + ) +}) + +test_that("Errors are thrown due to bad as_of inputs", { + # max_version cannot be of string class rather than date class + expect_error(ea %>% as_of("2020-01-01")) + # max_version cannot be later than latest version + expect_error(ea %>% as_of(as.Date("2025-01-01"))) + # max_version cannot be a vector + expect_error(ea %>% as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) +}) + +test_that("Warning against max_version being clobberable", { + # none by default + expect_warning(regexp = NA, ea %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% as_of(max_version = min(ea$DT$version))) + # but with `clobberable_versions_start` non-`NA`, yes + ea_with_clobberable <- ea %>% clone() + ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) + expect_warning(ea_with_clobberable %>% as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% as_of(max_version = min(ea$DT$version))) +}) + +test_that("as_of properly grabs the data and doesn't mutate key", { + d <- as.Date("2020-06-01") + + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + edf_as_of <- ea2 %>% + epix_as_of2(max_version = as.Date("2020-06-03")) + + edf_expected <- as_epi_df(tibble( + geo_value = "ca", + time_value = d + 0:2, + cases = c(2, 1, 1) + ), as_of = as.Date("2020-06-03")) + + expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", { + # x must be an archive + expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) + # max_version cannot be of string class rather than date class + expect_error(epix_truncate_versions_after(ea, "2020-01-01")) + # max_version cannot be a vector + expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02")))) + # max_version cannot be missing + expect_error(epix_truncate_versions_after(ea, as.Date(NA))) + # max_version cannot be after latest version in archive + expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01"))) +}) + +test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", { + ea2 <- ea2_data %>% + as_epi_archive2() + + old_key <- data.table::key(ea2$DT) + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-02")) + + ea_expected <- ea2_data[1:3, ] %>% + as_epi_archive2() + + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) + expect_equal(data.table::key(ea2$DT), old_key) +}) + +test_that("epix_truncate_version_after doesn't filter if max_verion at latest version", { + ea2 <- ea2_data %>% + as_epi_archive2() + + ea_expected <- ea2 %>% clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) +}) + +test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", { + ea2 <- ea2_data %>% + as_epi_archive2() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_epi_archive2(ea_as_of, grouped_okay = FALSE)) + + ea2_grouped <- ea2 %>% group_by(geo_value) + + ea_as_of <- ea2_grouped %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_true(is_grouped_epi_archive2(ea_as_of)) +}) + + +test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { + ea2 <- ea2_data %>% + as_epi_archive2() + ea2 <- ea2 %>% group_by(geo_value) + + ea_expected <- ea2 %>% clone() + + ea_as_of <- ea2 %>% + epix_truncate_versions_after(max_version = as.Date("2020-06-04")) + expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) +}) diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index d4fad3e7..c010c1f3 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -111,17 +111,17 @@ edf %>% edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive2() %>% group_by(geo_value) %>% - epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + epix_slide2(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive2() %>% group_by(geo_value) %>% - epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + epix_slide2(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() ``` @@ -219,9 +219,9 @@ edf %>% edf %>% mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive2() %>% group_by(geo_value) %>% - epix_slide( + epix_slide2( a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL @@ -317,16 +317,17 @@ x <- y1 %>% version = issue, percent_cli = value ) %>% - as_epi_archive(compactify = FALSE) + as_epi_archive2(compactify = FALSE) # mutating merge operation: -x$merge( +x <- epix_merge2( + x, y2 %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value ) %>% - as_epi_archive(compactify = FALSE), + as_epi_archive2(compactify = FALSE), sync = "locf", compactify = FALSE ) @@ -337,9 +338,9 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -x <- archive_cases_dv_subset$DT %>% +x <- archive_cases_dv_subset_2$DT %>% filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) + as_epi_archive2(compactify = FALSE) ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -457,7 +458,7 @@ data. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month" @@ -467,7 +468,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide( + epix_slide2( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index fdb0e3c6..0b57d639 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -76,16 +76,16 @@ the compactify vignette. ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) class(x) print(x) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset$DT %>% +x <- archive_cases_dv_subset_2$DT %>% select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) class(x) print(x) @@ -154,7 +154,7 @@ function `epix_as_of()` since this is likely a more familiar interface for users not familiar with R6 (or object-oriented programming). ```{r} -x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of2(x, max_version = as.Date("2021-06-01")) class(x_snapshot) head(x_snapshot) max(x_snapshot$time_value) @@ -174,7 +174,7 @@ this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) ``` Below, we pull several snapshots from the archive, spaced one month apart. We @@ -188,7 +188,7 @@ theme_set(theme_bw()) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { - epix_as_of(x, max_version = v) %>% mutate(version = v) + epix_as_of2(x, max_version = v) %>% mutate(version = v) }) %>% bind_rows( x_latest %>% mutate(version = self_max) @@ -258,15 +258,15 @@ y <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify = TRUE) + as_epi_archive2(compactify = TRUE) -x$merge(y, sync = "locf", compactify = FALSE) +x <- epix_merge2(x, y, sync = "locf", compactify = TRUE) print(x) head(x$DT) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset +x <- archive_cases_dv_subset_2 print(x) head(x$DT) ``` @@ -362,7 +362,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), z <- x %>% group_by(geo_value) %>% - epix_slide( + epix_slide2( fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119, ref_time_values = fc_time_values ) %>% @@ -389,14 +389,14 @@ points in time and forecast horizons. The former comes from using `epi_slide()` to the latest snapshot of the data `x_latest`. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% group_by(.data$geo_value) %>% - epix_slide( + epix_slide2( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values ) %>% diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index cad065e7..0b68c73b 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -32,10 +32,10 @@ from the second from the third value included. library(epiprocess) library(dplyr) -dt <- archive_cases_dv_subset$DT +dt <- archive_cases_dv_subset_2$DT -locf_omitted <- as_epi_archive(dt) -locf_included <- as_epi_archive(dt, compactify = FALSE) +locf_omitted <- as_epi_archive2(dt) +locf_included <- as_epi_archive2(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) @@ -48,8 +48,8 @@ LOCF-redundant values can mar the performance of dataset operations. As the colu ```{r} dt2 <- select(dt, -percent_cli) -locf_included_2 <- as_epi_archive(dt2, compactify = FALSE) -locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE) +locf_included_2 <- as_epi_archive2(dt2, compactify = FALSE) +locf_omitted_2 <- as_epi_archive2(dt2, compactify = TRUE) ``` In this example, a huge proportion of the original version update data were From 74aa8311731c2283c587cbd0934c26bbb26db445 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 19 Mar 2024 18:09:32 -0700 Subject: [PATCH 02/18] feat: replace epi_archive with S3 implementation --- DESCRIPTION | 6 +- NAMESPACE | 40 +- NEWS.md | 6 +- R/archive.R | 1083 +++++++--------- R/archive_new.R | 1115 ----------------- R/data.R | 12 +- R/group_by_epi_df_methods.R | 4 - R/grouped_archive_new.R | 456 ------- R/grouped_epi_archive.R | 756 ++++++----- R/growth_rate.R | 4 +- R/methods-epi_archive.R | 584 ++++----- R/methods-epi_archive_new.R | 826 ------------ R/utils.R | 2 - _pkgdown.yml | 3 + man/as_epi_archive.Rd | 142 --- man/as_epi_archive2.Rd | 142 --- man/as_of.epi_archive2.Rd | 33 - man/clone.Rd | 17 + man/compactify.Rd | 28 + man/epi_archive.Rd | 645 ++-------- man/epix_as_of.Rd | 26 +- man/epix_as_of2.Rd | 95 -- man/epix_fill_through_version.Rd | 10 +- man/epix_fill_through_version2.Rd | 48 - man/epix_merge.Rd | 23 +- man/epix_merge2.Rd | 71 -- man/epix_slide.Rd | 64 +- man/epix_slide2.Rd | 283 ----- man/epix_truncate_versions_after.Rd | 21 +- ...ate_versions_after.grouped_epi_archive2.Rd | 11 - man/fill_through_version.epi_archive2.Rd | 21 - man/group_by.epi_archive.Rd | 46 +- man/group_by.epi_archive2.Rd | 147 --- man/is_epi_archive.Rd | 2 +- man/is_epi_archive2.Rd | 35 - man/max_version_with_row_in.Rd | 9 +- man/merge_epi_archive2.Rd | 30 - man/new_epi_archive2.Rd | 69 - man/next_after.Rd | 8 +- ...t.epi_archive2.Rd => print.epi_archive.Rd} | 12 +- man/slide.epi_archive2.Rd | 101 -- man/slide.grouped_epi_archive2.Rd | 24 - man/truncate_versions_after.epi_archive2.Rd | 19 - ...ate_versions_after.grouped_epi_archive2.Rd | 18 - tests/testthat/test-archive-version-bounds.R | 10 +- tests/testthat/test-archive_new.R | 173 --- tests/testthat/test-compactify.R | 6 +- tests/testthat/test-compactify_new.R | 110 -- tests/testthat/test-deprecations.R | 20 +- .../testthat/test-epix_fill_through_version.R | 56 +- .../test-epix_fill_through_version_new.R | 109 -- tests/testthat/test-epix_merge.R | 7 +- tests/testthat/test-epix_merge_new.R | 226 ---- tests/testthat/test-epix_slide.R | 150 ++- tests/testthat/test-epix_slide_new.R | 810 ------------ tests/testthat/test-grouped_epi_archive.R | 4 - tests/testthat/test-grouped_epi_archive_new.R | 104 -- tests/testthat/test-methods-epi_archive.R | 41 +- tests/testthat/test-methods-epi_archive_new.R | 136 -- tests/testthat/test-utils.R | 10 - vignettes/advanced.Rmd | 42 +- vignettes/aggregation.Rmd | 6 +- vignettes/archive.Rmd | 105 +- vignettes/compactify.Rmd | 15 +- vignettes/epiprocess.Rmd | 13 +- 65 files changed, 1528 insertions(+), 7722 deletions(-) delete mode 100644 R/archive_new.R delete mode 100644 R/grouped_archive_new.R delete mode 100644 R/methods-epi_archive_new.R delete mode 100644 man/as_epi_archive.Rd delete mode 100644 man/as_epi_archive2.Rd delete mode 100644 man/as_of.epi_archive2.Rd create mode 100644 man/clone.Rd create mode 100644 man/compactify.Rd delete mode 100644 man/epix_as_of2.Rd delete mode 100644 man/epix_fill_through_version2.Rd delete mode 100644 man/epix_merge2.Rd delete mode 100644 man/epix_slide2.Rd delete mode 100644 man/epix_truncate_versions_after.grouped_epi_archive2.Rd delete mode 100644 man/fill_through_version.epi_archive2.Rd delete mode 100644 man/group_by.epi_archive2.Rd delete mode 100644 man/is_epi_archive2.Rd delete mode 100644 man/merge_epi_archive2.Rd delete mode 100644 man/new_epi_archive2.Rd rename man/{print.epi_archive2.Rd => print.epi_archive.Rd} (56%) delete mode 100644 man/slide.epi_archive2.Rd delete mode 100644 man/slide.grouped_epi_archive2.Rd delete mode 100644 man/truncate_versions_after.epi_archive2.Rd delete mode 100644 man/truncate_versions_after.grouped_epi_archive2.Rd delete mode 100644 tests/testthat/test-archive_new.R delete mode 100644 tests/testthat/test-compactify_new.R delete mode 100644 tests/testthat/test-epix_fill_through_version_new.R delete mode 100644 tests/testthat/test-epix_merge_new.R delete mode 100644 tests/testthat/test-epix_slide_new.R delete mode 100644 tests/testthat/test-grouped_epi_archive_new.R delete mode 100644 tests/testthat/test-methods-epi_archive_new.R diff --git a/DESCRIPTION b/DESCRIPTION index cfdd9f49..2b53474c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,6 @@ Imports: lubridate, magrittr, purrr, - R6, rlang, slider, tibble, @@ -50,7 +49,9 @@ Imports: vctrs Suggests: covidcast, + devtools, epidatr, + here, knitr, outbreaks, rmarkdown, @@ -73,7 +74,6 @@ Depends: URL: https://cmu-delphi.github.io/epiprocess/ Collate: 'archive.R' - 'archive_new.R' 'autoplot.R' 'correlation.R' 'data.R' @@ -81,11 +81,9 @@ Collate: 'epiprocess.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' - 'grouped_archive_new.R' 'grouped_epi_archive.R' 'growth_rate.R' 'key_colnames.R' - 'methods-epi_archive_new.R' 'methods-epi_df.R' 'outliers.R' 'reexports.R' diff --git a/NAMESPACE b/NAMESPACE index d5d1cd7b..cc25c7d7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,98 +6,78 @@ S3method(as_epi_df,data.frame) S3method(as_epi_df,epi_df) S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) -S3method(as_of,epi_archive2) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) S3method(autoplot,epi_df) -S3method(clone,epi_archive2) -S3method(clone,grouped_epi_archive2) +S3method(clone,epi_archive) +S3method(clone,grouped_epi_archive) S3method(dplyr_col_modify,col_modify_recorder_df) S3method(dplyr_col_modify,epi_df) S3method(dplyr_reconstruct,epi_df) S3method(dplyr_row_slice,epi_df) +S3method(epix_slide,epi_archive) +S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) -S3method(epix_truncate_versions_after,epi_archive2) S3method(epix_truncate_versions_after,grouped_epi_archive) -S3method(epix_truncate_versions_after,grouped_epi_archive2) S3method(group_by,epi_archive) -S3method(group_by,epi_archive2) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) -S3method(group_by,grouped_epi_archive2) S3method(group_by_drop_default,grouped_epi_archive) -S3method(group_by_drop_default,grouped_epi_archive2) S3method(group_modify,epi_df) S3method(groups,grouped_epi_archive) -S3method(groups,grouped_epi_archive2) S3method(key_colnames,data.frame) S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(next_after,Date) S3method(next_after,integer) -S3method(print,epi_archive2) +S3method(print,epi_archive) S3method(print,epi_df) -S3method(print,grouped_epi_archive2) +S3method(print,grouped_epi_archive) S3method(select,epi_df) -S3method(slide,grouped_epi_archive2) S3method(summary,epi_df) -S3method(truncate_versions_after,grouped_epi_archive2) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) -S3method(ungroup,grouped_epi_archive2) S3method(unnest,epi_df) export("%>%") export(archive_cases_dv_subset) -export(archive_cases_dv_subset_2) export(arrange) export(as_epi_archive) -export(as_epi_archive2) export(as_epi_df) -export(as_of) export(as_tsibble) export(autoplot) export(clone) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) -export(epi_archive) export(epi_cor) export(epi_slide) export(epi_slide_mean) export(epi_slide_opt) export(epi_slide_sum) export(epix_as_of) -export(epix_as_of2) +export(epix_fill_through_version) export(epix_merge) -export(epix_merge2) export(epix_slide) -export(epix_slide2) export(epix_truncate_versions_after) -export(fill_through_version) export(filter) export(group_by) export(group_modify) export(growth_rate) export(is_epi_archive) -export(is_epi_archive2) export(is_epi_df) export(is_grouped_epi_archive) -export(is_grouped_epi_archive2) export(key_colnames) export(max_version_with_row_in) export(mutate) -export(new_epi_archive2) +export(new_epi_archive) export(new_epi_df) export(next_after) export(relocate) export(rename) export(slice) -export(slide) -export(truncate_versions_after) export(ungroup) export(unnest) -importFrom(R6,R6Class) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) @@ -138,12 +118,15 @@ importFrom(dplyr,bind_rows) importFrom(dplyr,dplyr_col_modify) importFrom(dplyr,dplyr_reconstruct) importFrom(dplyr,dplyr_row_slice) +importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) importFrom(dplyr,group_modify) importFrom(dplyr,group_vars) importFrom(dplyr,groups) +importFrom(dplyr,if_all) +importFrom(dplyr,if_any) importFrom(dplyr,mutate) importFrom(dplyr,relocate) importFrom(dplyr,rename) @@ -166,6 +149,7 @@ importFrom(rlang,arg_match) importFrom(rlang,as_label) importFrom(rlang,caller_arg) importFrom(rlang,caller_env) +importFrom(rlang,check_dots_empty) importFrom(rlang,enquo) importFrom(rlang,enquos) importFrom(rlang,env) diff --git a/NEWS.md b/NEWS.md index e2c5b8e4..4d52ded5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,7 +32,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Breaking changes - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 -- Refactor `epi_archive` to use S3 instead of R6 for its object model. The calls to some functions will change, but the functionality will remain the same. It will also help us maintain the package better in the future. (#340) +- Refactor `epi_archive` to use S3 instead of R6 for its object model. The + functionality stay the same, but it will break the member function interface. + For migration, convert `epi_archive$merge` to `epi_archive %>% epix_merge` + (similar for `slide`, `fill_through_version`, `truncate_after_version`, and + `as_of`) (#340). # epiprocess 0.7.0 diff --git a/R/archive.R b/R/archive.R index a530cc05..f871d239 100644 --- a/R/archive.R +++ b/R/archive.R @@ -6,6 +6,7 @@ # `data.table::` everywhere and not importing things. .datatable_aware <- TRUE + #' Validate a version bound arg #' #' Expected to be used on `clobberable_versions_start`, `versions_end`, @@ -49,6 +50,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, return(invisible(NULL)) } + #' `max(x$version)`, with error if `x` has 0 rows #' #' Exported to make defaults more easily copyable. @@ -82,6 +84,7 @@ max_version_with_row_in <- function(x) { version_bound } + #' Get the next possible value greater than `x` of the same type #' #' @param x the starting "value"(s) @@ -90,22 +93,53 @@ max_version_with_row_in <- function(x) { #' @export next_after <- function(x) UseMethod("next_after") + #' @export next_after.integer <- function(x) x + 1L + #' @export next_after.Date <- function(x) x + 1L + +#' Compactify +#' +#' This section describes the internals of how compactification works in an +#' `epi_archive()`. Compactification can potentially improve code speed or +#' memory usage, depending on your data. +#' +#' In general, the last version of each observation is carried forward (LOCF) to +#' fill in data between recorded versions, and between the last recorded +#' update and the `versions_end`. One consequence is that the `DT` doesn't +#' have to contain a full snapshot of every version (although this generally +#' works), but can instead contain only the rows that are new or changed from +#' the previous version (see `compactify`, which does this automatically). +#' Currently, deletions must be represented as revising a row to a special +#' state (e.g., making the entries `NA` or including a special column that +#' flags the data as removed and performing some kind of post-processing), and +#' the archive is unaware of what this state is. Note that `NA`s *can* be +#' introduced by `epi_archive` methods for other reasons, e.g., in +#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to +#' represent potential update data that we do not yet have access to; or in +#' [`epix_merge`] to represent the "value" of an observation before the +#' version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' @name compactify +NULL + + +#' Epi Archive +#' #' @title `epi_archive` object #' -#' @description An `epi_archive` is an R6 class which contains a data table +#' @description An `epi_archive` is an S3 class which contains a data table #' along with several relevant pieces of metadata. The data table can be seen #' as the full archive (version history) for some signal variables of #' interest. #' -#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of -#' class `data.table` from the `data.table` package, with (at least) the -#' following columns: +#' @details An `epi_archive` contains a data table `DT`, of class `data.table` +#' from the `data.table` package, with (at least) the following columns: #' #' * `geo_value`: the geographic value associated with each row of measurements. #' * `time_value`: the time value associated with each row of measurements. @@ -118,38 +152,12 @@ next_after.Date <- function(x) x + 1L #' The data table `DT` has key variables `geo_value`, `time_value`, `version`, #' as well as any others (these can be specified when instantiating the #' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for -#' information and examples of relevant parameter names for an `epi_archive` object. -#' Note that there can only be a single row per unique combination of +#' on `DT` directly). Refer to the documentation for `as_epi_archive()` for +#' information and examples of relevant parameter names for an `epi_archive` +#' object. Note that there can only be a single row per unique combination of #' key variables, and thus the key variables are critical for figuring out how #' to generate a snapshot of data from the archive, as of a given version. #' -#' In general, the last version of each observation is carried forward (LOCF) to -#' fill in data between recorded versions, and between the last recorded -#' update and the `versions_end`. One consequence is that the `DT` -#' doesn't have to contain a full snapshot of every version (although this -#' generally works), but can instead contain only the rows that are new or -#' changed from the previous version (see `compactify`, which does this -#' automatically). Currently, deletions must be represented as revising a row -#' to a special state (e.g., making the entries `NA` or including a special -#' column that flags the data as removed and performing some kind of -#' post-processing), and the archive is unaware of what this state is. Note -#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, -#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to -#' represent potential update data that we do not yet have access to; or in -#' [`epix_merge`] to represent the "value" of an observation before the -#' version in which it was first released, or if no version of that -#' observation appears in the archive data at all. -#' -#' **A word of caution:** R6 objects, unlike most other objects in R, have -#' reference semantics. A primary consequence of this is that objects are not -#' copied when modified. You can read more about this in Hadley Wickham's -#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order -#' to construct a modified archive while keeping the original intact, first -#' make a clone using the `$clone` method, then overwrite the clone's `DT` -#' field with `data.table::copy(clone$DT)`, and finally perform the -#' modifications on the clone. -#' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` #' object: @@ -168,9 +176,8 @@ next_after.Date <- function(x) x + 1L #' @section Generating Snapshots: #' An `epi_archive` object can be used to generate a snapshot of the data in #' `epi_df` format, which represents the most up-to-date values of the signal -#' variables, as of the specified version. This is accomplished by calling the -#' `as_of()` method for an `epi_archive` object `x`. More details on this -#' method are documented in the wrapper function [`epix_as_of()`]. +#' variables, as of the specified version. This is accomplished by calling +#' `epix_as_of()`. #' #' @section Sliding Computations: #' We can run a sliding computation over an `epi_archive` object, much like @@ -179,595 +186,9 @@ next_after.Date <- function(x) x + 1L #' the way `epi_slide()` works for an `epi_df` object, but with one key #' difference: it is version-aware. That is, for an `epi_archive` object, the #' sliding computation at any given reference time point t is performed on -#' **data that would have been available as of t**. More details on `slide()` -#' are documented in the wrapper function [`epix_slide()`]. -#' -#' @importFrom R6 R6Class -#' @export -#' @examples -#' tib <- tibble::tibble( -#' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' value = rnorm(10, mean = 2, sd = 1) -#' ) +#' **data that would have been available as of t**. #' -#' toy_epi_archive <- tib %>% epi_archive$new( -#' geo_type = "state", -#' time_type = "day" -#' ) -#' toy_epi_archive -epi_archive <- - R6::R6Class( - classname = "epi_archive", - ##### - public = list( - #' @field DT (`data.table`)\cr - #' the (optionally compactified) datatable - DT = NULL, - #' @field geo_type (string)\cr - #' the resolution of the geographic label (e.g. state) - geo_type = NULL, - #' @field time_type (string)\cr - #' the resolution of the time column (e.g. day) - time_type = NULL, - #' @field additional_metadata (named list)\cr - #' any extra fields, such as `other_keys` - additional_metadata = NULL, - #' @field clobberable_versions_start (length-1 of same type&class as `version` column, or `NA`)\cr - #' the earliest version number that might be rewritten in the future without assigning a new version - #' date/number, or `NA` if this won't happen - clobberable_versions_start = NULL, - #' @field versions_end (length-1 of same type&class as `version` column)\cr - #' the latest version observed - versions_end = NULL, - #' @description Creates a new `epi_archive` object. - #' @param x A data frame, data table, or tibble, with columns `geo_value`, - #' `time_value`, `version`, and then any additional number of columns. - #' @param geo_type Type for the geo values. If missing, then the function will - #' attempt to infer it from the geo values present; if this fails, then it - #' will be set to "custom". - #' @param time_type Type for the time values. If missing, then the function will - #' attempt to infer it from the time values present; if this fails, then it - #' will be set to "custom". - #' @param other_keys Character vector specifying the names of variables in `x` - #' that should be considered key variables (in the language of `data.table`) - #' apart from "geo_value", "time_value", and "version". - #' @param additional_metadata List of additional metadata to attach to the - #' `epi_archive` object. The metadata will have `geo_type` and `time_type` - #' fields; named entries from the passed list or will be included as well. - #' @param compactify Optional; Boolean or `NULL`: should we remove rows that are - #' considered redundant for the purposes of `epi_archive`'s built-in methods - #' such as `as_of`? As these methods use the last version of each observation - #' carried forward (LOCF) to interpolate between the version data provided, - #' rows that don't change these LOCF results can potentially be omitted to - #' save space while maintaining the same behavior (with the help of the - #' `clobberable_versions_start` and `versions_end` fields in some edge cases). - #' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will - #' remove these rows and issue a warning. Generally, this can be set to - #' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` - #' such as its `DT`, or rely on redundant updates to achieve a certain - #' behavior of the `ref_time_values` default in `epix_slide`, you will have to - #' determine whether `compactify=TRUE` will produce the desired results. If - #' compactification here is removing a large proportion of the rows, this may - #' indicate a potential for space, time, or bandwidth savings upstream the - #' data pipeline, e.g., by avoiding fetching, storing, or processing these - #' rows of `x`. - #' @param clobberable_versions_start Optional; as in [`as_epi_archive`] - #' @param versions_end Optional; as in [`as_epi_archive`] - #' @return An `epi_archive` object. - #' @importFrom data.table as.data.table key setkeyv - #' - #' @details - #' Refer to the documentation for [as_epi_archive()] for more information - #' and examples of parameter names. - initialize = function(x, geo_type, time_type, other_keys, - additional_metadata, compactify, - clobberable_versions_start, versions_end) { - assert_data_frame(x) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } - - # If geo type is missing, then try to guess it - if (missing(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (missing(time_type)) { - time_type <- guess_time_type(x$time_value) - } - - # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys <- NULL - if (missing(additional_metadata)) additional_metadata <- list() - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } - - # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify <- NULL - } - assert_logical(compactify, len = 1, null.ok = TRUE) - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end)) { - versions_end <- max_version_with_row_in(x) - } - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - sprintf( - "`versions_end` was %s, but `x` contained - updates for a later version or versions, up through %s", - versions_end, max(x[["version"]]) - ), - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - sprintf( - "`versions_end` was %s, but a `clobberable_versions_start` - of %s indicated that there were later observed versions", - versions_end, clobberable_versions_start - ), - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - } - - # --- End of validation and replacing missing args with defaults --- - - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - key_vars <- c("geo_value", "time_value", other_keys, "version") - DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - - maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { - cli_abort("`x` must have one row per unique combination of the key variables. If you - have additional key variables other than `geo_value`, `time_value`, and - `version`, such as an age group column, please specify them in `other_keys`. - Otherwise, check for duplicate rows and/or conflicting values for the same - measurement.", - class = "epiprocess__epi_archive_requires_unique_key" - ) - } - - # Checks to see if a value in a vector is LOCF - is_locf <- function(vec) { - dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), - vec == dplyr::lag(vec), - is.na(vec) & is.na(dplyr::lag(vec)) - ) - } - - # LOCF is defined by a row where all values except for the version - # differ from their respective lag values - - # Checks for LOCF's in a data frame - rm_locf <- function(df) { - dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) - } - - # Keeps LOCF values, such as to be printed - keep_locf <- function(df) { - dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) - } - - # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { - elim <- keep_locf(DT) - DT <- rm_locf(DT) # nolint: object_name_linter - } else { - # Create empty data frame for nrow(elim) to be 0 - elim <- tibble::tibble() - } - - # Warns about redundant rows - if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- cli::format_inline( - "Found rows that appear redundant based on - last (version of each) observation carried forward; - these rows have been removed to 'compactify' and save space:", - keep_whitespace = FALSE - ) - warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - warning_outro <- cli::format_inline( - "Built-in `epi_archive` functionality should be unaffected, - but results may change if you work directly with its fields (such as `DT`). - See `?as_epi_archive` for details. - To silence this warning but keep compactification, - you can pass `compactify=TRUE` when constructing the archive.", - keep_whitespace = FALSE - ) - warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") - } - - # Instantiate all self variables - self$DT <- DT - self$geo_type <- geo_type - self$time_type <- time_type - self$additional_metadata <- additional_metadata - self$clobberable_versions_start <- clobberable_versions_start - self$versions_end <- versions_end - }, - #' Print information about an archive - #' @param class Boolean; whether to print the class label header - #' @param methods Boolean; whether to print all available methods of - #' the archive - #' @importFrom cli cli_inform - print = function(class = TRUE, methods = TRUE) { - cli_inform( - c( - ">" = if (class) "An `epi_archive` object, with metadata:", - "i" = if (length(setdiff(key(self$DT), c("geo_value", "time_value", "version"))) > 0) { - "Non-standard DT keys: {setdiff(key(self$DT), c('geo_value', 'time_value', 'version'))}" - }, - "i" = "Min/max time values: {min(self$DT$time_value)} / {max(self$DT$time_value)}", - "i" = "First/last version with update: {min(self$DT$version)} / {max(self$DT$version)}", - "i" = if (!is.na(self$clobberable_versions_start)) { - "Clobberable versions start: {self$clobberable_versions_start}" - }, - "i" = "Versions end: {self$versions_end}", - "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", - "i" = "A preview of the table ({nrow(self$DT)} rows x {ncol(self$DT)} columns):" - ) - ) - - return(invisible(self$DT %>% print())) - }, - ##### - #' @description Generates a snapshot in `epi_df` format as of a given version. - #' See the documentation for the wrapper function [`epix_as_of()`] for - #' details. The parameter descriptions below are copied from there - #' @param x An `epi_archive` object - #' @param max_version Version specifying the max version to permit in the - #' snapshot. That is, the snapshot will comprise the unique rows of the - #' current archive data that represent the most up-to-date signal values, as - #' of the specified `max_version` (and whose `time_value`s are at least - #' `min_time_value`). - #' @param min_time_value Time value specifying the min `time_value` to permit in - #' the snapshot. Default is `-Inf`, which effectively means that there is no - #' minimum considered. - #' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in - #' `epi_archive` format, and contain rows in the specified `time_value` range - #' having `version <= max_version`. The resulting object will cover a - #' potentially narrower `version` and `time_value` range than `x`, depending - #' on user-provided arguments. Otherwise, there will be one row in the output - #' for the `max_version` of each `time_value`. Default is `FALSE`. - #' @importFrom data.table between key - as_of = function(max_version, min_time_value = -Inf, all_versions = FALSE) { - # Self max version and other keys - other_keys <- setdiff( - key(self$DT), - c("geo_value", "time_value", "version") - ) - if (length(other_keys) == 0) other_keys <- NULL - - # Check a few things on max_version - if (!test_set_equal(class(max_version), class(self$DT$version))) { - cli_abort( - "`max_version` must have the same classes as `self$DT$version`." - ) - } - if (!test_set_equal(typeof(max_version), typeof(self$DT$version))) { - cli_abort( - "`max_version` must have the same types as `self$DT$version`." - ) - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > self$versions_end) { - cli_abort("`max_version` must be at most `self$versions_end`.") - } - assert_logical(all_versions, len = 1) - if (!is.na(self$clobberable_versions_start) && max_version >= self$clobberable_versions_start) { - cli_warn( - 'Getting data as of some recent version which could still be - overwritten (under routine circumstances) without assigning a new - version number (a.k.a. "clobbered"). Thus, the snapshot that we - produce here should not be expected to be reproducible later. See - `?epi_archive` for more info and `?epix_as_of` on how to muffle.', - class = "epiprocess__snapshot_as_of_clobberable_version" - ) - } - - # Filter by version and return - if (all_versions) { - result <- epix_truncate_versions_after(self, max_version) - # `self` has already been `clone`d in `epix_truncate_versions_after` - # so we can modify the new archive's DT directly. - result$DT <- result$DT[time_value >= min_time_value, ] - return(result) - } - - return( - # Make sure to use data.table ways of filtering and selecting - self$DT[time_value >= min_time_value & version <= max_version, ] %>% - unique( - by = c("geo_value", "time_value", other_keys), - fromLast = TRUE - ) %>% - tibble::as_tibble() %>% - dplyr::select(-"version") %>% - as_epi_df( - geo_type = self$geo_type, - time_type = self$time_type, - as_of = max_version, - additional_metadata = c(self$additional_metadata, - other_keys = other_keys - ) - ) - ) - }, - ##### - #' @description Fill in unobserved history using requested scheme by mutating - #' `self` and potentially reseating its fields. See - #' [`epix_fill_through_version`] for a full description of the non-R6-method - #' version, which doesn't mutate the input archive but might alias its fields. - #' - #' @param fill_versions_end as in [`epix_fill_through_version`] - #' @param how as in [`epix_fill_through_version`] - #' - #' @importFrom data.table key setkeyv := address copy - #' @importFrom rlang arg_match - fill_through_version = function(fill_versions_end, - how = c("na", "locf")) { - validate_version_bound(fill_versions_end, self$DT, na_ok = FALSE) - how <- arg_match(how) - if (self$versions_end < fill_versions_end) { - new_DT <- switch(how, # nolint: object_name_linter - "na" = { - # old DT + a version consisting of all NA observations - # immediately after the last currently/actually-observed - # version. Note that this NA-observation version must only be - # added if `self` is outdated. - nonversion_key_cols <- setdiff(key(self$DT), "version") - nonkey_cols <- setdiff(names(self$DT), key(self$DT)) - next_version_tag <- next_after(self$versions_end) - if (next_version_tag > fill_versions_end) { - cli_abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." - ), self$versions_end, next_version_tag, fill_versions_end)) - } - nonversion_key_vals_ever_recorded <- unique(self$DT, by = nonversion_key_cols) - # In edge cases, the `unique` result can alias the original - # DT; detect and copy if necessary: - if (identical(address(self$DT), address(nonversion_key_vals_ever_recorded))) { - nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) - } - next_version_DT <- nonversion_key_vals_ever_recorded[ # nolint: object_name_linter - , version := next_version_tag - ][ - # this makes the class of these columns logical (`NA` is a - # logical NA; we're relying on the rbind below to convert to - # the proper class&typeof) - , (nonkey_cols) := NA - ] - # full result DT: - setkeyv(rbind(self$DT, next_version_DT), key(self$DT))[] - }, - "locf" = { - # just the old DT; LOCF is built into other methods: - self$DT - } - ) - new_versions_end <- fill_versions_end - # Update `self` all at once with simple, error-free operations + - # return below: - self$DT <- new_DT - self$versions_end <- new_versions_end - } else { - # Already sufficiently up to date; nothing to do. - } - return(invisible(self)) - }, - ##### - #' @description Filter to keep only older versions, mutating the archive by - #' potentially reseating but not mutating some fields. `DT` is likely, but not - #' guaranteed, to be copied. Returns the mutated archive - #' [invisibly][base::invisible]. - #' @param x as in [`epix_truncate_versions_after`] - #' @param max_version as in [`epix_truncate_versions_after`] - truncate_versions_after = function(max_version) { - if (!test_set_equal(class(max_version), class(self$DT$version))) { - cli_abort("`max_version` must have the same classes as `self$DT$version`.") - } - if (!test_set_equal(typeof(max_version), typeof(self$DT$version))) { - cli_abort("`max_version` must have the same types as `self$DT$version`.") - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > self$versions_end) { - cli_abort("`max_version` must be at most `self$versions_end`.") - } - self$DT <- self$DT[self$DT$version <= max_version, colnames(self$DT), with = FALSE] - # (^ this filter operation seems to always copy the DT, even if it - # keeps every entry; we don't guarantee this behavior in - # documentation, though, so we could change to alias in this case) - if (!is.na(self$clobberable_versions_start) && self$clobberable_versions_start > max_version) { - self$clobberable_versions_start <- NA - } - self$versions_end <- max_version - return(invisible(self)) - }, - ##### - #' @description Merges another `epi_archive` with the current one, mutating the - #' current one by reseating its `DT` and several other fields, but avoiding - #' mutation of the old `DT`; returns the current archive - #' [invisibly][base::invisible]. See [`epix_merge`] for a full description - #' of the non-R6-method version, which does not mutate either archive, and - #' does not alias either archive's `DT`. - #' @param y as in [`epix_merge`] - #' @param sync as in [`epix_merge`] - #' @param compactify as in [`epix_merge`] - merge = function(y, sync = c("forbid", "na", "locf", "truncate"), compactify = TRUE) { - result <- epix_merge(self, y, - sync = sync, - compactify = compactify - ) - - if (length(epi_archive$private_fields) != 0L) { - cli_abort("expected no private fields in epi_archive", - internal = TRUE - ) - } - - # Mutate fields all at once, trying to avoid any potential errors: - for (field_name in names(epi_archive$public_fields)) { - self[[field_name]] <- result[[field_name]] - } - - return(invisible(self)) - }, - #' group an epi_archive - #' @description - #' group an epi_archive - #' @param ... variables or computations to group by. Computations are always - #' done on the ungrouped data frame. To perform computations on the grouped - #' data, you need to use a separate [`mutate()`] step before the - #' [`group_by()`] - #' @param .add When `FALSE`, the default, [`group_by()`] will override existing - #' groups. To add to the existing groups, use `.add = TRUE`. - #' @param .drop Drop groups formed by factor levels that don't appear in the - #' data. The default is `TRUE` except when `.data` has been previously grouped - #' with `.drop = FALSE`. See [`group_by_drop_default()`] for details. - group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { - group_by.epi_archive(self, ..., .add = .add, .drop = .drop) - }, - #' @description Slides a given function over variables in an `epi_archive` - #' object. See the documentation for the wrapper function [`epix_slide()`] for - #' details. The parameter descriptions below are copied from there - #' @importFrom data.table key - #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - #' @param f Function, formula, or missing; together with `...` specifies the - #' computation to slide. To "slide" means to apply a computation over a - #' sliding (a.k.a. "rolling") time window for each data group. The window is - #' determined by the `before` parameter described below. One time step is - #' typically one day or one week; see [`epi_slide`] details for more - #' explanation. If a function, `f` must take an `epi_df` with the same - #' column names as the archive's `DT`, minus the `version` column; followed - #' by a one-row tibble containing the values of the grouping variables for - #' the associated group; followed by a reference time value, usually as a - #' `Date` object; followed by any number of named arguments. If a formula, - #' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as - #' in `~ mean (.x$var)` to compute a mean of a column `var` for each - #' group-`ref_time_value` combination. The group key can be accessed via - #' `.y` or `.group_key`, and the reference time value can be accessed via - #' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the - #' computation. - #' @param ... Additional arguments to pass to the function or formula specified - #' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an - #' expression for tidy evaluation; in addition to referring to columns - #' directly by name, the expression has access to `.data` and `.env` pronouns - #' as in `dplyr` verbs, and can also refer to the `.group_key` and - #' `.ref_time_value`. See details of [`epi_slide`]. - #' @param before How far `before` each `ref_time_value` should the sliding - #' window extend? If provided, should be a single, non-NA, - #' [integer-compatible][vctrs::vec_cast] number of time steps. This window - #' endpoint is inclusive. For example, if `before = 7`, and one time step is - #' one day, then to produce a value for a `ref_time_value` of January 8, we - #' apply the given function or formula to data (for each group present) with - #' `time_value`s from January 1 onward, as they were reported on January 8. - #' For typical disease surveillance sources, this will not include any data - #' with a `time_value` of January 8, and, depending on the amount of reporting - #' latency, may not include January 7 or even earlier `time_value`s. (If - #' instead the archive were to hold nowcasts instead of regular surveillance - #' data, then we would indeed expect data for `time_value` January 8. If it - #' were to hold forecasts, then we would expect data for `time_value`s after - #' January 8, and the sliding window would extend as far after each - #' `ref_time_value` as needed to include all such `time_value`s.) - #' @param ref_time_values Reference time values / versions for sliding - #' computations; each element of this vector serves both as the anchor point - #' for the `time_value` window for the computation and the `max_version` - #' `as_of` which we fetch data in this window. If missing, then this will set - #' to a regularly-spaced sequence of values set to cover the range of - #' `version`s in the `DT` plus the `versions_end`; the spacing of values will - #' be guessed (using the GCD of the skips between values). - #' @param time_step Optional function used to define the meaning of one time - #' step, which if specified, overrides the default choice based on the - #' `time_value` column. This function must take a positive integer and return - #' an object of class `lubridate::period`. For example, we can use `time_step - #' = lubridate::hours` in order to set the time step to be one hour (this - #' would only be meaningful if `time_value` is of class `POSIXct`). - #' @param new_col_name String indicating the name of the new column that will - #' contain the derivative values. Default is "slide_value"; note that setting - #' `new_col_name` equal to an existing column name will overwrite this column. - #' @param as_list_col Should the slide results be held in a list column, or be - #' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, - #' in which case a list object returned by `f` would be unnested (using - #' [`tidyr::unnest()`]), and, if the slide computations output data frames, - #' the names of the resulting columns are given by prepending `new_col_name` - #' to the names of the list elements. - #' @param names_sep String specifying the separator to use in `tidyr::unnest()` - #' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix - #' from `new_col_name` entirely. - #' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If - #' `all_versions = TRUE`, then `f` will be passed the version history (all - #' `version <= ref_time_value`) for rows having `time_value` between - #' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be - #' passed only the most recent `version` for every unique `time_value`. - #' Default is `FALSE`. - slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # For an "ungrouped" slide, treat all rows as belonging to one big - # group (group by 0 vars), like `dplyr::summarize`, and let the - # resulting `grouped_epi_archive` handle the slide: - self$group_by()$slide( - f, ..., - before = before, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, - as_list_col = as_list_col, names_sep = names_sep, - all_versions = all_versions - ) %>% - # We want a slide on ungrouped archives to output something - # ungrouped, rather than retaining the trivial (0-variable) - # grouping applied above. So we `ungroup()`. However, the current - # `dplyr` implementation automatically ignores/drops trivial - # groupings, so this is just a no-op for now. - ungroup() - } - ) - ) - -#' Convert to `epi_archive` format -#' -#' Converts a data frame, data table, or tibble into an `epi_archive` -#' object. See the [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. The parameter descriptions below are copied from there -#' -#' @param x A data frame, data table, or tibble, with columns `geo_value`, +#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. #' @param geo_type Type for the geo values. If missing, then the function will #' attempt to infer it from the geo values present; if this fails, then it @@ -781,19 +202,9 @@ epi_archive <- #' @param additional_metadata List of additional metadata to attach to the #' `epi_archive` object. The metadata will have `geo_type` and `time_type` #' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are -#' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last version of each observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to -#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or -#' `NULL` will remove these rows and issue a warning. Generally, this can be -#' set to `TRUE`, but if you directly inspect or edit the fields of the -#' `epi_archive` such as its `DT`, you will have to determine whether -#' `compactify=TRUE` will produce the desired results. If compactification -#' here is removing a large proportion of the rows, this may indicate a -#' potential for space, time, or bandwidth savings upstream the data pipeline, -#' e.g., when fetching, storing, or preparing the input data `x` +#' @param compactify Optional; Boolean or `NULL`. `TRUE` will remove some +#' redundant rows, `FALSE` will not, and missing or `NULL` will remove +#' redundant rows, but issue a warning. See more information at `compactify`. #' @param clobberable_versions_start Optional; `length`-1; either a value of the #' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and #' `typeof`: specifically, either (a) the earliest version that could be @@ -820,17 +231,12 @@ epi_archive <- #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. #' @return An `epi_archive` object. #' -#' @details This simply a wrapper around the `new()` method of the `epi_archive` -#' class, so for example: -#' ``` -#' x <- as_epi_archive(df, geo_type = "state", time_type = "day") -#' ``` -#' would be equivalent to: -#' ``` -#' x <- epi_archive$new(df, geo_type = "state", time_type = "day") -#' ``` +#' @importFrom data.table as.data.table key setkeyv +#' @importFrom dplyr if_any if_all everything #' +#' @name epi_archive #' @export +#' #' @examples #' # Simple ex. with necessary keys #' tib <- tibble::tibble( @@ -875,17 +281,377 @@ epi_archive <- #' time_type = "day", #' other_keys = "county" #' ) -as_epi_archive <- function(x, geo_type, time_type, other_keys, +#' +new_epi_archive <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + # If geo type is missing, then try to guess it + if (is.null(geo_type)) { + geo_type <- guess_geo_type(x$geo_value) + } + + # If time type is missing, then try to guess it + if (missing(time_type) || is.null(time_type)) { + time_type <- guess_time_type(x$time_value) + } + + # Finish off with small checks on keys variables and metadata + if (missing(other_keys)) other_keys <- NULL + if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + if (missing(compactify)) { + compactify <- NULL + } + assert_logical(compactify, len = 1, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + if (missing(clobberable_versions_start)) { + clobberable_versions_start <- NA + } + if (missing(versions_end) || is.null(versions_end)) { + versions_end <- max_version_with_row_in(x) + } + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + sprintf( + "`versions_end` was %s, but `x` contained + updates for a later version or versions, up through %s", + versions_end, max(x[["version"]]) + ), + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + sprintf( + "`versions_end` was %s, but a `clobberable_versions_start` + of %s indicated that there were later observed versions", + versions_end, clobberable_versions_start + ), + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } + + # --- End of validation and replacing missing args with defaults --- + + # Create the data table; if x was an un-keyed data.table itself, + # then the call to as.data.table() will fail to set keys, so we + # need to check this, then do it manually if needed + key_vars <- c("geo_value", "time_value", other_keys, "version") + DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter + if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) + + maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) + if (maybe_first_duplicate_key_row_index != 0L) { + cli_abort("`x` must have one row per unique combination of the key variables. If you + have additional key variables other than `geo_value`, `time_value`, and + `version`, such as an age group column, please specify them in `other_keys`. + Otherwise, check for duplicate rows and/or conflicting values for the same + measurement.", + class = "epiprocess__epi_archive_requires_unique_key" + ) + } + + # Checks to see if a value in a vector is LOCF + is_locf <- function(vec) { # nolint: object_usage_linter + dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), + vec == dplyr::lag(vec), + is.na(vec) & is.na(dplyr::lag(vec)) + ) + } + + # LOCF is defined by a row where all values except for the version + # differ from their respective lag values + + # Checks for LOCF's in a data frame + rm_locf <- function(df) { + dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) # nolint: object_usage_linter + } + + # Keeps LOCF values, such as to be printed + keep_locf <- function(df) { + dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) # nolint: object_usage_linter + } + + # Runs compactify on data frame + if (is.null(compactify) || compactify == TRUE) { + elim <- keep_locf(DT) + DT <- rm_locf(DT) # nolint: object_name_linter + } else { + # Create empty data frame for nrow(elim) to be 0 + elim <- tibble::tibble() + } + + # Warns about redundant rows + if (is.null(compactify) && nrow(elim) > 0) { + warning_intro <- cli::format_inline( + "Found rows that appear redundant based on + last (version of each) observation carried forward; + these rows have been removed to 'compactify' and save space:", + keep_whitespace = FALSE + ) + warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) + warning_outro <- cli::format_inline( + "Built-in `epi_archive` functionality should be unaffected, + but results may change if you work directly with its fields (such as `DT`). + See `?as_epi_archive` for details. + To silence this warning but keep compactification, + you can pass `compactify=TRUE` when constructing the archive.", + keep_whitespace = FALSE + ) + warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) + rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") + } + + structure( + list( + DT = DT, + geo_type = geo_type, + time_type = time_type, + additional_metadata = additional_metadata, + clobberable_versions_start = clobberable_versions_start, + versions_end = versions_end + ), + class = "epi_archive" + ) +} + + +#' `as_epi_archive` converts a data frame, data table, or tibble into an +#' `epi_archive` object. +#' +#' @rdname epi_archive +#' +#' @export +as_epi_archive <- function(x, geo_type = NULL, time_type = NULL, other_keys = NULL, additional_metadata = list(), compactify = NULL, clobberable_versions_start = NA, versions_end = max_version_with_row_in(x)) { - epi_archive$new( + new_epi_archive( x, geo_type, time_type, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end ) } + +#' Print information about an `epi_archive` object +#' +#' @param x An `epi_archive` object. +#' @param ... Should be empty, there to satisfy the S3 generic. +#' @param class Boolean; whether to print the class label header +#' @param methods Boolean; whether to print all available methods of +#' the archive +#' +#' @importFrom cli cli_inform +#' @importFrom rlang check_dots_empty +#' @export +print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { + if (rlang::dots_n(...) > 0) { + cli_abort(c( + "Error in print.epi_archive()", + "i" = "Too many arguments passed to `print.epi_archive()`." + )) + } + + cli_inform( + c( + ">" = if (class) "An `epi_archive` object, with metadata:", + "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { + "Non-standard DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}" + }, + "i" = "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}", + "i" = "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}", + "i" = if (!is.na(x$clobberable_versions_start)) { + "Clobberable versions start: {x$clobberable_versions_start}" + }, + "i" = "Versions end: {x$versions_end}", + "i" = "A preview of the table ({nrow(x$DT)} rows x {ncol(x$DT)} columns):" + ) + ) + + print(x$DT[]) + return(invisible(x)) +} + + +#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` +#' +#' @param .data An `epi_archive` or `grouped_epi_archive` +#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); +#' * For `group_by`: unquoted variable name(s) or other +#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to +#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to +#' perform grouping, but note that, if you are regrouping an already-grouped +#' `.data` object, the calculations will be carried out ignoring such grouping +#' (same as [in dplyr][dplyr::group_by]). +#' * For `ungroup`: either +#' * empty, in order to remove the grouping and output an `epi_archive`; or +#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] +#' expression(s), in order to remove the matching variables from the list of +#' grouping variables, and output another `grouped_epi_archive`. +#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by +#' the variable selection from `...` only; if `TRUE`, the output will be +#' grouped by the current grouping variables plus the variable selection from +#' `...`. +#' @param .drop As described in [`dplyr::group_by`]; determines treatment of +#' factor columns. +#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for +#' `is_grouped_epi_archive`: any object +#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or +#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; +#' `grouped_epi_archive` dispatches its own S3 method) +#' +#' @details +#' +#' To match `dplyr`, `group_by` allows "data masking" (also referred to as +#' "tidy evaluation") expressions `...`, not just column names, in a way similar +#' to `mutate`. Note that replacing or removing key columns with these +#' expressions is disabled. +#' +#' `archive %>% group_by()` and other expressions that group or regroup by zero +#' columns (indicating that all rows should be treated as part of one large +#' group) will output a `grouped_epi_archive`, in order to enable the use of +#' `grouped_epi_archive` methods on the result. This is in slight contrast to +#' the same operations on tibbles and grouped tibbles, which will *not* output a +#' `grouped_df` in these circumstances. +#' +#' Using `group_by` with `.add=FALSE` to override the existing grouping is +#' disabled; instead, `ungroup` first then `group_by`. +#' +#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch +#' to `group_by_drop_default.default` (but there is a dedicated method for +#' `grouped_epi_archive`s). +#' +#' @examples +#' +#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) +#' +#' # `print` for metadata and method listing: +#' grouped_archive %>% print() +#' +#' # The primary use for grouping is to perform a grouped `epix_slide`: +#' +#' archive_cases_dv_subset %>% +#' group_by(geo_value) %>% +#' epix_slide( +#' f = ~ mean(.x$case_rate_7d_av), +#' before = 2, +#' ref_time_values = as.Date("2020-06-11") + 0:2, +#' new_col_name = "case_rate_3d_av" +#' ) %>% +#' ungroup() +#' +#' # ----------------------------------------------------------------- +#' +#' # Advanced: some other features of dplyr grouping are implemented: +#' +#' library(dplyr) +#' toy_archive <- +#' tribble( +#' ~geo_value, ~age_group, ~time_value, ~version, ~value, +#' "us", "adult", "2000-01-01", "2000-01-02", 121, +#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) +#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) +#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) +#' ) %>% +#' mutate( +#' age_group = ordered(age_group, c("pediatric", "adult")), +#' time_value = as.Date(time_value), +#' version = as.Date(version) +#' ) %>% +#' as_epi_archive(other_keys = "age_group") +#' +#' # The following are equivalent: +#' toy_archive %>% group_by(geo_value, age_group) +#' toy_archive %>% +#' group_by(geo_value) %>% +#' group_by(age_group, .add = TRUE) +#' grouping_cols <- c("geo_value", "age_group") +#' toy_archive %>% group_by(across(all_of(grouping_cols))) +#' +#' # And these are equivalent: +#' toy_archive %>% group_by(geo_value) +#' toy_archive %>% +#' group_by(geo_value, age_group) %>% +#' ungroup(age_group) +#' +#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): +#' toy_archive %>% +#' group_by(geo_value) %>% +#' groups() +#' +#' toy_archive %>% +#' group_by(geo_value, age_group, .drop = FALSE) %>% +#' epix_slide(f = ~ sum(.x$value), before = 20) %>% +#' ungroup() +#' +#' @importFrom dplyr group_by +#' @export +#' +#' @aliases grouped_epi_archive +group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { + # `add` makes no difference; this is an ungrouped `epi_archive`. + detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) + assert_logical(.drop) + if (!.drop) { + grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] + grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) + # ^ Use `as.list` to try to avoid any possibility of a deep copy. + if (!any(grouping_col_is_factor)) { + cli_warn( + "`.drop=FALSE` but there are no factor grouping columns; + did you mean to convert one of the columns to a factor beforehand?", + class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" + ) + } else if (any(diff(grouping_col_is_factor) == -1L)) { + cli_warn( + "`.drop=FALSE` but there are one or more non-factor grouping columns listed + after a factor grouping column; this may produce groups with `NA`s for these + columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; + depending on how you want completion to work, you might instead want to convert all + grouping columns to factors beforehand, specify the non-factor grouping columns first, + or use `.drop=TRUE` and add a call to `tidyr::complete`.", + class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" + ) + } + } + new_grouped_epi_archive(detailed_mutate[["archive"]], + detailed_mutate[["request_names"]], + drop = .drop + ) +} + + #' Test for `epi_archive` format #' #' @param x An object. @@ -901,7 +667,7 @@ as_epi_archive <- function(x, geo_type, time_type, other_keys, #' # By default, grouped_epi_archives don't count as epi_archives, as they may #' # support a different set of operations from regular `epi_archives`. This #' # behavior can be controlled by `grouped_okay`. -#' grouped_archive <- archive_cases_dv_subset$group_by(geo_value) +#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) #' is_epi_archive(grouped_archive) # FALSE #' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE #' @@ -909,3 +675,22 @@ as_epi_archive <- function(x, geo_type, time_type, other_keys, is_epi_archive <- function(x, grouped_okay = FALSE) { inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive") } + + +#' Clone an `epi_archive` object. +#' +#' @param x An `epi_archive` object. +#' +#' @importFrom data.table copy +#' @export +clone <- function(x) { + UseMethod("clone") +} + + +#' @rdname clone +#' @export +clone.epi_archive <- function(x) { + x$DT <- data.table::copy(x$DT) + return(x) +} diff --git a/R/archive_new.R b/R/archive_new.R deleted file mode 100644 index 0b4f3695..00000000 --- a/R/archive_new.R +++ /dev/null @@ -1,1115 +0,0 @@ -# We use special features of data.table's `[`. The data.table package has a -# compatibility feature that disables some/all of these features if it thinks we -# might expect `data.frame`-compatible behavior instead. We can signal that we -# want the special behavior via `.datatable.aware = TRUE` or by importing any -# `data.table` package member. Do both to prevent surprises if we decide to use -# `data.table::` everywhere and not importing things. -.datatable.aware <- TRUE - -#' Validate a version bound arg -#' -#' Expected to be used on `clobberable_versions_start`, `versions_end`, -#' and similar arguments. Some additional context-specific checks may be needed. -#' -#' @param version_bound the version bound to validate -#' @param x a data frame containing a version column with which to check -#' compatibility -#' @param na_ok Boolean; is `NA` an acceptable "bound"? (If so, `NA` will -#' have a special context-dependent meaning.) -#' @param version_bound_arg optional string; what to call the version bound in -#' error messages -#' -#' @section Side effects: raises an error if version bound appears invalid -#' -#' @noRd -validate_version_bound <- function(version_bound, x, na_ok = FALSE, - version_bound_arg = rlang::caller_arg(version_bound), - x_arg = rlang::caller_arg(version_bound)) { - if (is.null(version_bound)) { - cli_abort( - "{version_bound_arg} cannot be NULL" - ) - } - if (na_ok && is.na(version_bound)) { - return(invisible(NULL)) - } - if (!test_set_equal(class(version_bound), class(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same classes as x$version, - which is {class(x$version)}", - ) - } - if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same types as x$version, - which is {typeof(x$version)}", - ) - } - - return(invisible(NULL)) -} - -#' `max(x$version)`, with error if `x` has 0 rows -#' -#' Exported to make defaults more easily copyable. -#' -#' @param x `x` argument of [`as_epi_archive`] -#' -#' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or -#' an `NA` version value -#' -#' @export -max_version_with_row_in <- function(x) { - if (nrow(x) == 0L) { - cli_abort( - "`nrow(x)==0L`, representing a data set history with no row up through the - latest observed version, but we don't have a sensible guess at what version - that is, or whether any of the empty versions might be clobbered in the - future; if we use `x` to form an `epi_archive`, then - `clobberable_versions_start` and `versions_end` must be manually specified.", - class = "epiprocess__max_version_cannot_be_used" - ) - } else { - version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist - if (anyNA(version_col)) { - cli_abort("version values cannot be NA", - class = "epiprocess__version_values_must_not_be_na" - ) - } else { - version_bound <- max(version_col) - } - } -} - -#' Get the next possible value greater than `x` of the same type -#' -#' @param x the starting "value"(s) -#' @return same class, typeof, and length as `x` -#' -#' @export -next_after <- function(x) UseMethod("next_after") - -#' @export -next_after.integer <- function(x) x + 1L - -#' @export -next_after.Date <- function(x) x + 1L - - - -#' epi archive -#' @title `epi_archive` object -#' -#' @description An `epi_archive` is an R6 class which contains a data table -#' along with several relevant pieces of metadata. The data table can be seen -#' as the full archive (version history) for some signal variables of -#' interest. -#' -#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of -#' class `data.table` from the `data.table` package, with (at least) the -#' following columns: -#' -#' * `geo_value`: the geographic value associated with each row of measurements. -#' * `time_value`: the time value associated with each row of measurements. -#' * `version`: the time value specifying the version for each row of -#' measurements. For example, if in a given row the `version` is January 15, -#' 2022 and `time_value` is January 14, 2022, then this row contains the -#' measurements of the data for January 14, 2022 that were available one day -#' later. -#' -#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, -#' as well as any others (these can be specified when instantiating the -#' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Refer to the documentation for [as_epi_archive()] for -#' information and examples of relevant parameter names for an `epi_archive` object. -#' Note that there can only be a single row per unique combination of -#' key variables, and thus the key variables are critical for figuring out how -#' to generate a snapshot of data from the archive, as of a given version. -#' -#' In general, the last version of each observation is carried forward (LOCF) to -#' fill in data between recorded versions, and between the last recorded -#' update and the `versions_end`. One consequence is that the `DT` -#' doesn't have to contain a full snapshot of every version (although this -#' generally works), but can instead contain only the rows that are new or -#' changed from the previous version (see `compactify`, which does this -#' automatically). Currently, deletions must be represented as revising a row -#' to a special state (e.g., making the entries `NA` or including a special -#' column that flags the data as removed and performing some kind of -#' post-processing), and the archive is unaware of what this state is. Note -#' that `NA`s *can* be introduced by `epi_archive` methods for other reasons, -#' e.g., in [`epix_fill_through_version`] and [`epix_merge`], if requested, to -#' represent potential update data that we do not yet have access to; or in -#' [`epix_merge`] to represent the "value" of an observation before the -#' version in which it was first released, or if no version of that -#' observation appears in the archive data at all. -#' -#' **A word of caution:** R6 objects, unlike most other objects in R, have -#' reference semantics. A primary consequence of this is that objects are not -#' copied when modified. You can read more about this in Hadley Wickham's -#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. In order -#' to construct a modified archive while keeping the original intact, first -#' make a clone using the `$clone` method, then overwrite the clone's `DT` -#' field with `data.table::copy(clone$DT)`, and finally perform the -#' modifications on the clone. -#' -#' @section Metadata: -#' The following pieces of metadata are included as fields in an `epi_archive` -#' object: -#' -#' * `geo_type`: the type for the geo values. -#' * `time_type`: the type for the time values. -#' * `additional_metadata`: list of additional metadata for the data archive. -#' -#' Unlike an `epi_df` object, metadata for an `epi_archive` object `x` can be -#' accessed (and altered) directly, as in `x$geo_type` or `x$time_type`, -#' etc. Like an `epi_df` object, the `geo_type` and `time_type` fields in the -#' metadata of an `epi_archive` object are not currently used by any -#' downstream functions in the `epiprocess` package, and serve only as useful -#' bits of information to convey about the data set at hand. -#' -#' @section Generating Snapshots: -#' An `epi_archive` object can be used to generate a snapshot of the data in -#' `epi_df` format, which represents the most up-to-date values of the signal -#' variables, as of the specified version. This is accomplished by calling the -#' `as_of()` method for an `epi_archive` object `x`. More details on this -#' method are documented in the wrapper function [`epix_as_of()`]. -#' -#' @section Sliding Computations: -#' We can run a sliding computation over an `epi_archive` object, much like -#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling -#' the `slide()` method for an `epi_archive` object, which works similarly to -#' the way `epi_slide()` works for an `epi_df` object, but with one key -#' difference: it is version-aware. That is, for an `epi_archive` object, the -#' sliding computation at any given reference time point t is performed on -#' **data that would have been available as of t**. More details on `slide()` -#' are documented in the wrapper function [`epix_slide()`]. -#' -#' @export -#' @examples -#' tib <- tibble::tibble( -#' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' value = rnorm(10, mean = 2, sd = 1) -#' ) -#' -#' toy_epi_archive <- tib %>% new_epi_archive2( -#' geo_type = "state", -#' time_type = "day" -#' ) -#' toy_epi_archive -#' @name epi_archive -# TODO: Figure out where to actually put this documentation -NULL - -#' New epi archive -#' @description Creates a new `epi_archive` object. -#' @param x A data.frame, data.table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". -#' @param other_keys Character vector specifying the names of variables in `x` -#' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have `geo_type` and `time_type` -#' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are -#' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last version of each observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to -#' save space while maintaining the same behavior (with the help of the -#' `clobberable_versions_start` and `versions_end` fields in some edge cases). -#' `TRUE` will remove these rows, `FALSE` will not, and missing or `NULL` will -#' remove these rows and issue a warning. Generally, this can be set to -#' `TRUE`, but if you directly inspect or edit the fields of the `epi_archive` -#' such as its `DT`, or rely on redundant updates to achieve a certain -#' behavior of the `ref_time_values` default in `epix_slide`, you will have to -#' determine whether `compactify=TRUE` will produce the desired results. If -#' compactification here is removing a large proportion of the rows, this may -#' indicate a potential for space, time, or bandwidth savings upstream the -#' data pipeline, e.g., by avoiding fetching, storing, or processing these -#' rows of `x`. -#' @param clobberable_versions_start Optional; as in [`as_epi_archive`] -#' @param versions_end Optional; as in [`as_epi_archive`] -#' @return An `epi_archive` object. -#' @importFrom data.table as.data.table key setkeyv -#' -#' @details -#' Refer to the documentation for [as_epi_archive()] for more information -#' and examples of parameter names. -#' @export -new_epi_archive2 <- function( - x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NA, - versions_end = NULL) { - assert_data_frame(x) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } - - # If geo type is missing, then try to guess it - if (missing(geo_type) || is.null(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (missing(time_type) || is.null(time_type)) { - time_type <- guess_time_type(x$time_value) - } - - # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys <- NULL - if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } - - # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify <- NULL - } - assert_logical(compactify, len = 1, null.ok = TRUE) - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end) || is.null(versions_end)) { - versions_end <- max_version_with_row_in(x) - } - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - sprintf( - "`versions_end` was %s, but `x` contained - updates for a later version or versions, up through %s", - versions_end, max(x[["version"]]) - ), - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - sprintf( - "`versions_end` was %s, but a `clobberable_versions_start` - of %s indicated that there were later observed versions", - versions_end, clobberable_versions_start - ), - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - } - - # --- End of validation and replacing missing args with defaults --- - - # Create the data table; if x was an un-keyed data.table itself, - # then the call to as.data.table() will fail to set keys, so we - # need to check this, then do it manually if needed - key_vars <- c("geo_value", "time_value", other_keys, "version") - DT <- as.data.table(x, key = key_vars) - if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - - maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { - cli_abort("`x` must have one row per unique combination of the key variables. If you - have additional key variables other than `geo_value`, `time_value`, and - `version`, such as an age group column, please specify them in `other_keys`. - Otherwise, check for duplicate rows and/or conflicting values for the same - measurement.", - class = "epiprocess__epi_archive_requires_unique_key" - ) - } - - # Checks to see if a value in a vector is LOCF - is_locf <- function(vec) { - dplyr::if_else(!is.na(vec) & !is.na(dplyr::lag(vec)), - vec == dplyr::lag(vec), - is.na(vec) & is.na(dplyr::lag(vec)) - ) - } - - # LOCF is defined by a row where all values except for the version - # differ from their respective lag values - - # Checks for LOCF's in a data frame - rm_locf <- function(df) { - dplyr::filter(df, if_any(c(everything(), -version), ~ !is_locf(.))) - } - - # Keeps LOCF values, such as to be printed - keep_locf <- function(df) { - dplyr::filter(df, if_all(c(everything(), -version), ~ is_locf(.))) - } - - # Runs compactify on data frame - if (is.null(compactify) || compactify == TRUE) { - elim <- keep_locf(DT) - DT <- rm_locf(DT) - } else { - # Create empty data frame for nrow(elim) to be 0 - elim <- tibble::tibble() - } - - # Warns about redundant rows - if (is.null(compactify) && nrow(elim) > 0) { - warning_intro <- cli::format_inline( - "Found rows that appear redundant based on - last (version of each) observation carried forward; - these rows have been removed to 'compactify' and save space:", - keep_whitespace = FALSE - ) - warning_data <- paste(collapse = "\n", capture.output(print(elim, topn = 3L, nrows = 7L))) - warning_outro <- cli::format_inline( - "Built-in `epi_archive` functionality should be unaffected, - but results may change if you work directly with its fields (such as `DT`). - See `?as_epi_archive` for details. - To silence this warning but keep compactification, - you can pass `compactify=TRUE` when constructing the archive.", - keep_whitespace = FALSE - ) - warning_message <- paste(sep = "\n", warning_intro, warning_data, warning_outro) - rlang::warn(warning_message, class = "epiprocess__compactify_default_removed_rows") - } - - structure( - list( - DT = DT, - geo_type = geo_type, - time_type = time_type, - additional_metadata = additional_metadata, - clobberable_versions_start = clobberable_versions_start, - versions_end = versions_end, - private = list() # TODO: to be encapsulated with guard-rails later - ), - class = "epi_archive2" - ) -} - -#' Print information about an `epi_archive` object -#' @param class Boolean; whether to print the class label header -#' @param methods Boolean; whether to print all available methods of -#' the archive -#' @importFrom cli cli_inform -#' @export -print.epi_archive2 <- function(epi_archive, class = TRUE, methods = TRUE) { - cli_inform( - c( - ">" = if (class) "An `epi_archive` object, with metadata:", - "i" = if (length(setdiff(key(epi_archive$DT), c("geo_value", "time_value", "version"))) > 0) { - "Non-standard DT keys: {setdiff(key(epi_archive$DT), c('geo_value', 'time_value', 'version'))}" - }, - "i" = "Min/max time values: {min(epi_archive$DT$time_value)} / {max(epi_archive$DT$time_value)}", - "i" = "First/last version with update: {min(epi_archive$DT$version)} / {max(epi_archive$DT$version)}", - "i" = if (!is.na(epi_archive$clobberable_versions_start)) { - "Clobberable versions start: {epi_archive$clobberable_versions_start}" - }, - "i" = "Versions end: {epi_archive$versions_end}", - "i" = if (methods) "Public R6 methods: {names(epi_archive$public_methods)}", - "i" = "A preview of the table ({nrow(epi_archive$DT)} rows x {ncol(epi_archive$DT)} columns):" - ) - ) - - return(invisible(epi_archive$DT %>% print())) -} - - -#' @export -as_of <- function(x, ...) { - UseMethod("as_of") -} - - -#' As of epi_archive -#' @description Generates a snapshot in `epi_df` format as of a given version. -#' See the documentation for the wrapper function [`epix_as_of()`] for -#' details. The parameter descriptions below are copied from there -#' @param epi_archive An `epi_archive` object -#' @param max_version Version specifying the max version to permit in the -#' snapshot. That is, the snapshot will comprise the unique rows of the -#' current archive data that represent the most up-to-date signal values, as -#' of the specified `max_version` (and whose `time_value`s are at least -#' `min_time_value`). -#' @param min_time_value Time value specifying the min `time_value` to permit in -#' the snapshot. Default is `-Inf`, which effectively means that there is no -#' minimum considered. -#' @param all_versions Boolean; If `all_versions = TRUE`, then the output will be in -#' `epi_archive` format, and contain rows in the specified `time_value` range -#' having `version <= max_version`. The resulting object will cover a -#' potentially narrower `version` and `time_value` range than `x`, depending -#' on user-provided arguments. Otherwise, there will be one row in the output -#' for the `max_version` of each `time_value`. Default is `FALSE`. -#' @importFrom data.table between key -#' @export -as_of.epi_archive2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { - other_keys <- setdiff( - key(epi_archive$DT), - c("geo_value", "time_value", "version") - ) - if (length(other_keys) == 0) other_keys <- NULL - - # Check a few things on max_version - if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { - cli_abort( - "`max_version` must have the same classes as `epi_archive$DT$version`." - ) - } - if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { - cli_abort( - "`max_version` must have the same types as `epi_archive$DT$version`." - ) - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > epi_archive$versions_end) { - cli_abort("`max_version` must be at most `epi_archive$versions_end`.") - } - assert_logical(all_versions, len = 1) - if (!is.na(epi_archive$clobberable_versions_start) && max_version >= epi_archive$clobberable_versions_start) { - cli_warn( - 'Getting data as of some recent version which could still be - overwritten (under routine circumstances) without assigning a new - version number (a.k.a. "clobbered"). Thus, the snapshot that we - produce here should not be expected to be reproducible later. See - `?epi_archive` for more info and `?epix_as_of` on how to muffle.', - class = "epiprocess__snapshot_as_of_clobberable_version" - ) - } - - # Filter by version and return - if (all_versions) { - # epi_archive is copied into result, so we can modify result directly - result <- epix_truncate_versions_after(epi_archive, max_version) - result$DT <- result$DT[time_value >= min_time_value, ] - return(result) - } - - # Make sure to use data.table ways of filtering and selecting - as_of_epi_df <- epi_archive$DT[time_value >= min_time_value & version <= max_version, ] %>% - unique( - by = c("geo_value", "time_value", other_keys), - fromLast = TRUE - ) %>% - tibble::as_tibble() %>% - dplyr::select(-"version") %>% - as_epi_df( - geo_type = epi_archive$geo_type, - time_type = epi_archive$time_type, - as_of = max_version, - additional_metadata = c(epi_archive$additional_metadata, - other_keys = other_keys - ) - ) - - return(as_of_epi_df) -} - - -#' @export -fill_through_version <- function(x, ...) { - UseMethod("fill_through_version") -} - - -#' Fill through version -#' @description Fill in unobserved history using requested scheme by mutating -#' the given object and potentially reseating its fields. See -#' [`epix_fill_through_version`], which doesn't mutate the input archive but -#' might alias its fields. -#' -#' @param epi_archive an `epi_archive` object -#' @param fill_versions_end as in [`epix_fill_through_version`] -#' @param how as in [`epix_fill_through_version`] -#' -#' @importFrom data.table key setkeyv := address copy -#' @importFrom rlang arg_match -fill_through_version.epi_archive2 <- function( - epi_archive, - fill_versions_end, - how = c("na", "locf")) { - validate_version_bound(fill_versions_end, epi_archive$DT, na_ok = FALSE) - how <- arg_match(how) - if (epi_archive$versions_end < fill_versions_end) { - new_DT <- switch(how, - "na" = { - # old DT + a version consisting of all NA observations - # immediately after the last currently/actually-observed - # version. Note that this NA-observation version must only be - # added if `epi_archive` is outdated. - nonversion_key_cols <- setdiff(key(epi_archive$DT), "version") - nonkey_cols <- setdiff(names(epi_archive$DT), key(epi_archive$DT)) - next_version_tag <- next_after(epi_archive$versions_end) - if (next_version_tag > fill_versions_end) { - cli_abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." - ), epi_archive$versions_end, next_version_tag, fill_versions_end)) - } - nonversion_key_vals_ever_recorded <- unique(epi_archive$DT, by = nonversion_key_cols) - # In edge cases, the `unique` result can alias the original - # DT; detect and copy if necessary: - if (identical(address(epi_archive$DT), address(nonversion_key_vals_ever_recorded))) { - nonversion_key_vals_ever_recorded <- copy(nonversion_key_vals_ever_recorded) - } - next_version_DT <- nonversion_key_vals_ever_recorded[ - , version := next_version_tag - ][ - # this makes the class of these columns logical (`NA` is a - # logical NA; we're relying on the rbind below to convert to - # the proper class&typeof) - , (nonkey_cols) := NA - ] - # full result DT: - setkeyv(rbind(epi_archive$DT, next_version_DT), key(epi_archive$DT))[] - }, - "locf" = { - # just the old DT; LOCF is built into other methods: - epi_archive$DT - } - ) - new_versions_end <- fill_versions_end - # Update `epi_archive` all at once with simple, error-free operations + - # return below: - epi_archive$DT <- new_DT - epi_archive$versions_end <- new_versions_end - } else { - # Already sufficiently up to date; nothing to do. - } - return(invisible(epi_archive)) -} - - -#' @export -truncate_versions_after <- function(x, ...) { - UseMethod("truncate_versions_after") -} - - -#' Truncate versions after -#' @description Filter to keep only older versions, mutating the archive by -#' potentially reseating but not mutating some fields. `DT` is likely, but not -#' guaranteed, to be copied. Returns the mutated archive -#' [invisibly][base::invisible]. -#' @param epi_archive as in [`epix_truncate_versions_after`] -#' @param max_version as in [`epix_truncate_versions_after`] -truncate_versions_after.epi_archive2 <- function( - epi_archive, - max_version) { - if (!test_set_equal(class(max_version), class(epi_archive$DT$version))) { - cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") - } - if (!test_set_equal(typeof(max_version), typeof(epi_archive$DT$version))) { - cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") - } - assert_scalar(max_version, na.ok = FALSE) - if (max_version > epi_archive$versions_end) { - cli_abort("`max_version` must be at most `epi_archive$versions_end`.") - } - epi_archive$DT <- epi_archive$DT[epi_archive$DT$version <= max_version, colnames(epi_archive$DT), with = FALSE] - # (^ this filter operation seems to always copy the DT, even if it - # keeps every entry; we don't guarantee this behavior in - # documentation, though, so we could change to alias in this case) - if (!is.na(epi_archive$clobberable_versions_start) && epi_archive$clobberable_versions_start > max_version) { - epi_archive$clobberable_versions_start <- NA - } - epi_archive$versions_end <- max_version - return(invisible(epi_archive)) -} - - -#' Merge epi archive -#' @description Merges another `epi_archive` with the current one, mutating the -#' current one by reseating its `DT` and several other fields, but avoiding -#' mutation of the old `DT`; returns the current archive -#' [invisibly][base::invisible]. See [`epix_merge`] for a full description -#' of the non-R6-method version, which does not mutate either archive, and -#' does not alias either archive's `DT`.a -#' @param x as in [`epix_merge`] -#' @param y as in [`epix_merge`] -#' @param sync as in [`epix_merge`] -#' @param compactify as in [`epix_merge`] -merge_epi_archive2 <- function( - x, - y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE) { - result <- epix_merge(x, y, - sync = sync, - compactify = compactify - ) - - # TODO: Use encapsulating methods instead. - if (length(x$private_fields) != 0L) { - cli_abort("expected no private fields in x", - internal = TRUE - ) - } - - # Mutate fields all at once, trying to avoid any potential errors: - for (field_name in names(x$public_fields)) { - x[[field_name]] <- result[[field_name]] - } - - return(invisible(x)) -} - - -#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` -#' -#' @param .data An `epi_archive` or `grouped_epi_archive` -#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); -#' * For `group_by`: unquoted variable name(s) or other -#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to -#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to -#' perform grouping, but note that, if you are regrouping an already-grouped -#' `.data` object, the calculations will be carried out ignoring such grouping -#' (same as [in dplyr][dplyr::group_by]). -#' * For `ungroup`: either -#' * empty, in order to remove the grouping and output an `epi_archive`; or -#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] -#' expression(s), in order to remove the matching variables from the list of -#' grouping variables, and output another `grouped_epi_archive`. -#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by -#' the variable selection from `...` only; if `TRUE`, the output will be -#' grouped by the current grouping variables plus the variable selection from -#' `...`. -#' @param .drop As described in [`dplyr::group_by`]; determines treatment of -#' factor columns. -#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for -#' `is_grouped_epi_archive`: any object -#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or -#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; -#' `grouped_epi_archive` dispatches its own S3 method) -#' -#' @details -#' -#' To match `dplyr`, `group_by` allows "data masking" (also referred to as -#' "tidy evaluation") expressions `...`, not just column names, in a way similar -#' to `mutate`. Note that replacing or removing key columns with these -#' expressions is disabled. -#' -#' `archive %>% group_by()` and other expressions that group or regroup by zero -#' columns (indicating that all rows should be treated as part of one large -#' group) will output a `grouped_epi_archive`, in order to enable the use of -#' `grouped_epi_archive` methods on the result. This is in slight contrast to -#' the same operations on tibbles and grouped tibbles, which will *not* output a -#' `grouped_df` in these circumstances. -#' -#' Using `group_by` with `.add=FALSE` to override the existing grouping is -#' disabled; instead, `ungroup` first then `group_by`. -#' -#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`, -#' introducing column-level aliasing between its input and its result. This -#' doesn't follow the general model for most `data.table` operations, which -#' seems to be that, given an nonaliased (i.e., unique) pointer to a -#' `data.table` object, its pointers to its columns should also be nonaliased. -#' If you mutate any of the columns of either the input or result, first ensure -#' that it is fine if columns of the other are also mutated, but do not rely on -#' such behavior to occur. Additionally, never perform mutation on the key -#' columns at all (except for strictly increasing transformations), as this will -#' invalidate sortedness assumptions about the rows. -#' -#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch -#' to `group_by_drop_default.default` (but there is a dedicated method for -#' `grouped_epi_archive`s). -#' -#' @examples -#' -#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) -#' -#' # `print` for metadata and method listing: -#' grouped_archive %>% print() -#' -#' # The primary use for grouping is to perform a grouped `epix_slide`: -#' -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = "case_rate_3d_av" -#' ) %>% -#' ungroup() -#' -#' # ----------------------------------------------------------------- -#' -#' # Advanced: some other features of dplyr grouping are implemented: -#' -#' library(dplyr) -#' toy_archive <- -#' tribble( -#' ~geo_value, ~age_group, ~time_value, ~version, ~value, -#' "us", "adult", "2000-01-01", "2000-01-02", 121, -#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) -#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) -#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) -#' ) %>% -#' mutate( -#' age_group = ordered(age_group, c("pediatric", "adult")), -#' time_value = as.Date(time_value), -#' version = as.Date(version) -#' ) %>% -#' as_epi_archive2(other_keys = "age_group") -#' -#' # The following are equivalent: -#' toy_archive %>% group_by(geo_value, age_group) -#' toy_archive %>% -#' group_by(geo_value) %>% -#' group_by(age_group, .add = TRUE) -#' grouping_cols <- c("geo_value", "age_group") -#' toy_archive %>% group_by(across(all_of(grouping_cols))) -#' -#' # And these are equivalent: -#' toy_archive %>% group_by(geo_value) -#' toy_archive %>% -#' group_by(geo_value, age_group) %>% -#' ungroup(age_group) -#' -#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -#' toy_archive %>% -#' group_by(geo_value) %>% -#' groups() -#' -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop = FALSE) %>% -#' epix_slide2(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' -#' @importFrom dplyr group_by -#' @export -#' -#' @aliases grouped_epi_archive -group_by.epi_archive2 <- function(epi_archive, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(epi_archive)) { - # `add` makes no difference; this is an ungrouped `epi_archive`. - detailed_mutate <- epix_detailed_restricted_mutate2(epi_archive, ...) - assert_logical(.drop) - if (!.drop) { - grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] - grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) - # ^ Use `as.list` to try to avoid any possibility of a deep copy. - if (!any(grouping_col_is_factor)) { - cli_warn( - "`.drop=FALSE` but there are no factor grouping columns; - did you mean to convert one of the columns to a factor beforehand?", - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) - } else if (any(diff(grouping_col_is_factor) == -1L)) { - cli_warn( - "`.drop=FALSE` but there are one or more non-factor grouping columns listed - after a factor grouping column; this may produce groups with `NA`s for these - columns; see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; - depending on how you want completion to work, you might instead want to convert all - grouping columns to factors beforehand, specify the non-factor grouping columns first, - or use `.drop=TRUE` and add a call to `tidyr::complete`.", - class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" - ) - } - } - new_grouped_epi_archive(detailed_mutate[["archive"]], - detailed_mutate[["request_names"]], - drop = .drop - ) -} - - -#' @export -slide <- function(.data, ...) { - UseMethod("slide") -} - - -#' Slide over epi archive -#' @description Slides a given function over variables in an `epi_archive` -#' object. See the documentation for the wrapper function [`epix_slide()`] for -#' details. The parameter descriptions below are copied from there -#' @importFrom data.table key -#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms -#' @param f Function, formula, or missing; together with `...` specifies the -#' computation to slide. To "slide" means to apply a computation over a -#' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `before` parameter described below. One time step is -#' typically one day or one week; see [`epi_slide`] details for more -#' explanation. If a function, `f` must take an `epi_df` with the same -#' column names as the archive's `DT`, minus the `version` column; followed -#' by a one-row tibble containing the values of the grouping variables for -#' the associated group; followed by a reference time value, usually as a -#' `Date` object; followed by any number of named arguments. If a formula, -#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as -#' in `~ mean (.x$var)` to compute a mean of a column `var` for each -#' group-`ref_time_value` combination. The group key can be accessed via -#' `.y` or `.group_key`, and the reference time value can be accessed via -#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the -#' computation. -#' @param ... Additional arguments to pass to the function or formula specified -#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an -#' expression for tidy evaluation; in addition to referring to columns -#' directly by name, the expression has access to `.data` and `.env` pronouns -#' as in `dplyr` verbs, and can also refer to the `.group_key` and -#' `.ref_time_value`. See details of [`epi_slide`]. -#' @param before How far `before` each `ref_time_value` should the sliding -#' window extend? If provided, should be a single, non-NA, -#' [integer-compatible][vctrs::vec_cast] number of time steps. This window -#' endpoint is inclusive. For example, if `before = 7`, and one time step is -#' one day, then to produce a value for a `ref_time_value` of January 8, we -#' apply the given function or formula to data (for each group present) with -#' `time_value`s from January 1 onward, as they were reported on January 8. -#' For typical disease surveillance sources, this will not include any data -#' with a `time_value` of January 8, and, depending on the amount of reporting -#' latency, may not include January 7 or even earlier `time_value`s. (If -#' instead the archive were to hold nowcasts instead of regular surveillance -#' data, then we would indeed expect data for `time_value` January 8. If it -#' were to hold forecasts, then we would expect data for `time_value`s after -#' January 8, and the sliding window would extend as far after each -#' `ref_time_value` as needed to include all such `time_value`s.) -#' @param ref_time_values Reference time values / versions for sliding -#' computations; each element of this vector serves both as the anchor point -#' for the `time_value` window for the computation and the `max_version` -#' `as_of` which we fetch data in this window. If missing, then this will set -#' to a regularly-spaced sequence of values set to cover the range of -#' `version`s in the `DT` plus the `versions_end`; the spacing of values will -#' be guessed (using the GCD of the skips between values). -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a positive integer and return -#' an object of class `lubridate::period`. For example, we can use `time_step -#' = lubridate::hours` in order to set the time step to be one hour (this -#' would only be meaningful if `time_value` is of class `POSIXct`). -#' @param new_col_name String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col Should the slide results be held in a list column, or be -#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, -#' in which case a list object returned by `f` would be unnested (using -#' [`tidyr::unnest()`]), and, if the slide computations output data frames, -#' the names of the resulting columns are given by prepending `new_col_name` -#' to the names of the list elements. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_name` entirely. -#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If -#' `all_versions = TRUE`, then `f` will be passed the version history (all -#' `version <= ref_time_value`) for rows having `time_value` between -#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be -#' passed only the most recent `version` for every unique `time_value`. -#' Default is `FALSE`. -slide.epi_archive2 <- function(epi_archive, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # For an "ungrouped" slide, treat all rows as belonging to one big - # group (group by 0 vars), like `dplyr::summarize`, and let the - # resulting `grouped_epi_archive` handle the slide: - slide( - group_by(epi_archive), - f, - ..., - before = before, ref_time_values = ref_time_values, - time_step = time_step, new_col_name = new_col_name, - as_list_col = as_list_col, names_sep = names_sep, - all_versions = all_versions - ) %>% - # We want a slide on ungrouped archives to output something - # ungrouped, rather than retaining the trivial (0-variable) - # grouping applied above. So we `ungroup()`. However, the current - # `dplyr` implementation automatically ignores/drops trivial - # groupings, so this is just a no-op for now. - ungroup() -} - - -#' Convert to `epi_archive` format -#' -#' Converts a data frame, data table, or tibble into an `epi_archive` -#' object. See the [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. The parameter descriptions below are copied from there -#' -#' @param x A data frame, data table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". -#' @param other_keys Character vector specifying the names of variables in `x` -#' that should be considered key variables (in the language of `data.table`) -#' apart from "geo_value", "time_value", and "version". -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_archive` object. The metadata will have `geo_type` and `time_type` -#' fields; named entries from the passed list or will be included as well. -#' @param compactify Optional; Boolean or `NULL`: should we remove rows that are -#' considered redundant for the purposes of `epi_archive`'s built-in methods -#' such as `as_of`? As these methods use the last version of each observation -#' carried forward (LOCF) to interpolate between the version data provided, -#' rows that don't change these LOCF results can potentially be omitted to -#' save space. `TRUE` will remove these rows, `FALSE` will not, and missing or -#' `NULL` will remove these rows and issue a warning. Generally, this can be -#' set to `TRUE`, but if you directly inspect or edit the fields of the -#' `epi_archive` such as its `DT`, you will have to determine whether -#' `compactify=TRUE` will produce the desired results. If compactification -#' here is removing a large proportion of the rows, this may indicate a -#' potential for space, time, or bandwidth savings upstream the data pipeline, -#' e.g., when fetching, storing, or preparing the input data `x` -#' @param clobberable_versions_start Optional; `length`-1; either a value of the -#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and -#' `typeof`: specifically, either (a) the earliest version that could be -#' subject to "clobbering" (being overwritten with different update data, but -#' using the *same* version tag as the old update data), or (b) `NA`, to -#' indicate that no versions are clobberable. There are a variety of reasons -#' why versions could be clobberable under routine circumstances, such as (a) -#' today's version of one/all of the columns being published after initially -#' being filled with `NA` or LOCF, (b) a buggy version of today's data being -#' published but then fixed and republished later in the day, or (c) data -#' pipeline delays (e.g., publisher uploading, periodic scraping, database -#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected -#' later in the day (or even on a different day) than expected; potential -#' causes vary between different data pipelines. The default value is `NA`, -#' which doesn't consider any versions to be clobberable. Another setting that -#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. -#' @param versions_end Optional; length-1, same `class` and `typeof` as -#' `x$version`: what is the last version we have observed? The default is -#' `max_version_with_row_in(x)`, but values greater than this could also be -#' valid, and would indicate that we observed additional versions of the data -#' beyond `max(x$version)`, but they all contained empty updates. (The default -#' value of `clobberable_versions_start` does not fully trust these empty -#' updates, and assumes that any version `>= max(x$version)` could be -#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. -#' @return An `epi_archive` object. -#' -#' @details This simply a wrapper around the `new()` method of the `epi_archive` -#' class, so for example: -#' ``` -#' x <- as_epi_archive(df, geo_type = "state", time_type = "day") -#' ``` -#' would be equivalent to: -#' ``` -#' x <- epi_archive$new(df, geo_type = "state", time_type = "day") -#' ``` -#' -#' @export -#' @examples -#' # Simple ex. with necessary keys -#' tib <- tibble::tibble( -#' geo_value = rep(c("ca", "hi"), each = 5), -#' time_value = rep(seq(as.Date("2020-01-01"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' version = rep(seq(as.Date("2020-01-02"), -#' by = 1, length.out = 5 -#' ), times = 2), -#' value = rnorm(10, mean = 2, sd = 1) -#' ) -#' -#' toy_epi_archive <- tib %>% as_epi_archive2( -#' geo_type = "state", -#' time_type = "day" -#' ) -#' toy_epi_archive -#' -#' # Ex. with an additional key for county -#' df <- data.frame( -#' geo_value = c(replicate(2, "ca"), replicate(2, "fl")), -#' county = c(1, 3, 2, 5), -#' time_value = c( -#' "2020-06-01", -#' "2020-06-02", -#' "2020-06-01", -#' "2020-06-02" -#' ), -#' version = c( -#' "2020-06-02", -#' "2020-06-03", -#' "2020-06-02", -#' "2020-06-03" -#' ), -#' cases = c(1, 2, 3, 4), -#' cases_rate = c(0.01, 0.02, 0.01, 0.05) -#' ) -#' -#' x <- df %>% as_epi_archive2( -#' geo_type = "state", -#' time_type = "day", -#' other_keys = "county" -#' ) -as_epi_archive2 <- function(x, geo_type, time_type, other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x)) { - new_epi_archive2( - x, geo_type, time_type, other_keys, additional_metadata, - compactify, clobberable_versions_start, versions_end - ) -} - -#' Test for `epi_archive` format -#' -#' @param x An object. -#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also -#' count? Default is `FALSE`. -#' @return `TRUE` if the object inherits from `epi_archive`. -#' -#' @export -#' @examples -#' is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -#' is_epi_archive2(archive_cases_dv_subset_2) # TRUE -#' -#' # By default, grouped_epi_archives don't count as epi_archives, as they may -#' # support a different set of operations from regular `epi_archives`. This -#' # behavior can be controlled by `grouped_okay`. -#' grouped_archive <- archive_cases_dv_subset_2 %>% group_by(geo_value) -#' is_epi_archive2(grouped_archive) # FALSE -#' is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE -#' -#' @seealso [`is_grouped_epi_archive`] -is_epi_archive2 <- function(x, grouped_okay = FALSE) { - inherits(x, "epi_archive2") || grouped_okay && inherits(x, "grouped_epi_archive2") -} - - -#' @export -clone <- function(x, ...) { - UseMethod("clone") -} - - -#' @export -clone.epi_archive2 <- function(epi_archive, deep = FALSE) { - # TODO: Finish. - if (deep) { - epi_archive$DT <- copy(epi_archive$DT) - } else { - epi_archive$DT <- copy(epi_archive$DT) - } - return(epi_archive) -} diff --git a/R/data.R b/R/data.R index 37ccc522..cbaaa901 100644 --- a/R/data.R +++ b/R/data.R @@ -195,11 +195,11 @@ delayed_assign_with_unregister_awareness <- function(x, value, # Like normal data objects, set `archive_cases_dv_subset` up as a promise, so it # doesn't take unnecessary space before it's evaluated. This also avoids a need # for @include tags. However, this pattern will use unnecessary space after this -# promise is evaluated, because `as_epi_archive` clones `archive_cases_dv_subset_dt` +# promise is evaluated, because `as_epi_archive` copies `archive_cases_dv_subset_dt` # and `archive_cases_dv_subset_dt` will stick around along with `archive_cases_dv_subset` # after they have been evaluated. We may want to add an option to avoid cloning # in `as_epi_archive` and make use of it here. But we may also want to change -# this into an active binding that clones every time, unless we can hide the +# this into an active binding that copies every time, unless we can hide the # `DT` field from the user (make it non-`public` in general) or make it # read-only (in this specific case), so that the user cannot modify the `DT` # here and potentially mess up examples that they refer to later on. @@ -289,11 +289,3 @@ delayed_assign_with_unregister_awareness( #' * Furthermore, the data has been limited to a very small number of rows, the #' signal names slightly altered, and formatted into a tibble. "jhu_csse_county_level_subset" - -#' @export -"archive_cases_dv_subset_2" - -delayed_assign_with_unregister_awareness( - "archive_cases_dv_subset_2", - as_epi_archive2(archive_cases_dv_subset_dt, compactify = FALSE) -) diff --git a/R/group_by_epi_df_methods.R b/R/group_by_epi_df_methods.R index b531178f..949cc914 100644 --- a/R/group_by_epi_df_methods.R +++ b/R/group_by_epi_df_methods.R @@ -11,7 +11,3 @@ select.epi_df <- function(.data, ...) { might_decay <- reclass(selected, attr(selected, "metadata")) return(dplyr_reconstruct(might_decay, might_decay)) } - -# others to consider: -# - arrange -# - diff --git a/R/grouped_archive_new.R b/R/grouped_archive_new.R deleted file mode 100644 index c0e6c35e..00000000 --- a/R/grouped_archive_new.R +++ /dev/null @@ -1,456 +0,0 @@ -#' -#' Convenience function for performing a `tidy_select` on dots according to its -#' docs, and taking the names (rather than the integer indices). -#' -#' @param ... tidyselect-syntax selection description -#' @param .data named vector / data frame; context for the description / the -#' object to which the selections apply -#' @return character vector containing names of entries/columns of -#' `names(.data)` denoting the selection -#' -#' @noRd -eval_pure_select_names_from_dots <- function(..., .data) { - # `?tidyselect::eval_select` tells us to use this form when we take in dots. - # It seems a bit peculiar, since the expr doesn't pack with it a way to get at - # the environment for the dots, but it looks like `eval_select` will assume - # the caller env (our `environment()`) when given an expr, and thus have - # access to the dots. - # - # If we were allowing renaming, we'd need to be careful about which names (new - # vs. old vs. both) to return here. - names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) -} - -#' Get names of dots without forcing the dots -#' -#' For use in functions that use nonstandard evaluation (NSE) on the dots; we -#' can't use the pattern `names(list(...))` in this case because it will attempt -#' to force/(standard-)evaluate the dots, and we want to avoid attempted forcing of the -#' dots if we're using NSE. -#' -#' @noRd -nse_dots_names <- function(...) { - names(rlang::call_match()) -} -nse_dots_names2 <- function(...) { - rlang::names2(rlang::call_match()) -} - -#' @importFrom dplyr group_by_drop_default -#' @noRd -new_grouped_epi_archive <- function(ungrouped, vars, drop) { - if (inherits(ungrouped, "grouped_epi_archive")) { - cli_abort( - "`ungrouped` must not already be grouped (neither automatic regrouping - nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, - or `ungroup` first.", - class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", - epiprocess__ungrouped_class = class(ungrouped), - epiprocess__ungrouped_groups = groups(ungrouped) - ) - } - assert_class(ungrouped, "epi_archive2") - assert_character(vars) - if (!test_subset(vars, names(ungrouped$DT))) { - cli_abort( - "All grouping variables `vars` must be present in the data.", - ) - } - if ("version" %in% vars) { - cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") - } - assert_logical(drop, len = 1) - - # ----- - private <- list() - private$ungrouped <- ungrouped - private$vars <- vars - private$drop <- drop - - return(structure( - list( - private = private - ), - class = c("grouped_epi_archive2", "epi_archive2") - )) -} - -#' @export -print.grouped_epi_archive2 <- function(grouped_epi_archive, class = TRUE) { - if (class) cat("A `grouped_epi_archive` object:\n") - writeLines(wrap_varnames(grouped_epi_archive$private$vars, initial = "* Groups: ")) - # If none of the grouping vars is a factor, then $drop doesn't seem - # relevant, so try to be less verbose and don't message about it. - # - # Below map-then-extract may look weird, but the more natural - # extract-then-map appears to trigger copies of the extracted columns - # since we are working with a `data.table` (unless we go through - # `as.list`, but its current column-aliasing behavior is probably not - # something to rely too much on), while map functions currently appear - # to avoid column copies. - if (any(purrr::map_lgl(grouped_epi_archive$private$ungrouped$DT, is.factor)[grouped_epi_archive$private$vars])) { - cat(strwrap(init = "* ", prefix = " ", sprintf( - "%s groups formed by factor levels that don't appear in the data", - if (grouped_epi_archive$private$drop) "Drops" else "Does not drop" - ))) - cat("\n") - } - cat("It wraps an ungrouped `epi_archive`, with metadata:\n") - print(grouped_epi_archive$private$ungrouped, class = FALSE) - # Return self invisibly for convenience in `$`-"pipe": - invisible(grouped_epi_archive) -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @importFrom dplyr group_by -#' @export -group_by.grouped_epi_archive2 <- function( - grouped_epi_archive, - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(grouped_epi_archive)) { - assert_logical(.add, len = 1) - if (!.add) { - cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden - (neither automatic regrouping nor nested grouping is supported). - If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. - If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. - ', - class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" - ) - } else { - # `group_by` `...` computations are performed on ungrouped data (see - # `?dplyr::group_by`) - detailed_mutate <- epix_detailed_restricted_mutate2(grouped_epi_archive$private$ungrouped, ...) - out_ungrouped <- detailed_mutate[["archive"]] - vars_from_dots <- detailed_mutate[["request_names"]] - vars <- union(grouped_epi_archive$private$vars, vars_from_dots) - new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, vars, .drop) - } -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @export -group_by_drop_default.grouped_epi_archive2 <- function(grouped_epi_archive) { - grouped_epi_archive$private$drop -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @importFrom dplyr groups -#' @export -groups.grouped_epi_archive2 <- function(grouped_epi_archive) { - rlang::syms(grouped_epi_archive$private$vars) -} - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @importFrom dplyr ungroup -#' @export -ungroup.grouped_epi_archive2 <- function(grouped_epi_archive, ...) { - if (rlang::dots_n(...) == 0L) { - # No dots = special behavior: remove all grouping vars and convert to - # an ungrouped class, as with `grouped_df`s. - grouped_epi_archive$private$ungrouped - } else { - exclude_vars <- eval_pure_select_names_from_dots(..., .data = grouped_epi_archive$private$ungrouped$DT) - # (requiring a pure selection here is a little stricter than dplyr - # implementations, but passing a renaming selection into `ungroup` - # seems pretty weird.) - result_vars <- grouped_epi_archive$private$vars[!grouped_epi_archive$private$vars %in% exclude_vars] - # `vars` might be length 0 if the user's tidyselection removed all - # grouping vars. Unlike with tibble, opt here to keep the result as a - # grouped_epi_archive, for output class consistency when `...` is - # provided. - new_grouped_epi_archive(grouped_epi_archive$private$ungrouped, result_vars, grouped_epi_archive$private$drop) - } -} - -#' Truncate versions after a given version, grouped -#' @description Filter to keep only older versions by mutating the underlying -#' `epi_archive` using `$truncate_versions_after`. Returns the mutated -#' `grouped_epi_archive` [invisibly][base::invisible]. -#' @param x as in [`epix_truncate_versions_after`] -#' @param max_version as in [`epix_truncate_versions_after`] -#' @export -truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { - # The grouping is irrelevant for this method; if we were to split into - # groups and recombine appropriately, we should get the same result as - # just leveraging the ungrouped method, so just do the latter: - truncate_versions_after(grouped_epi_archive$private$ungrouped, max_version) - return(invisible(grouped_epi_archive)) -} - -#' Truncate versions after a given version, grouped -#' @export -epix_truncate_versions_after.grouped_epi_archive2 <- function(grouped_epi_archive, max_version) { - cloned_group_epi_archive <- clone(grouped_epi_archive, deep = TRUE) - return((truncate_versions_after(cloned_group_epi_archive, max_version))) - # ^ second set of parens drops invisibility -} - - -#' Slide over grouped epi archive -#' @description Slides a given function over variables in a `grouped_epi_archive` -#' object. See the documentation for the wrapper function [`epix_slide()`] for -#' details. -#' @importFrom data.table key address rbindlist setDF -#' @importFrom tibble as_tibble new_tibble validate_tibble -#' @importFrom dplyr group_by groups -#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms -#' env missing_arg -#' @export -slide.grouped_epi_archive2 <- function(grouped_epi_archive, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = - # deprecated()` in the function signature, because they are from - # early development versions and much more likely to be clutter than - # informative in the signature. - if ("group_by" %in% nse_dots_names(...)) { - cli_abort(" - The `group_by` argument to `slide` has been removed; please use - the `group_by` S3 generic function or `$group_by` R6 method - before the slide instead. (If you were instead trying to pass a - `group_by` argument to `f` or create a column named `group_by`, - this check is a false positive, but you will still need to use a - different column name here and rename the resulting column after - the slide.) - ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") - } - if ("all_rows" %in% nse_dots_names(...)) { - cli_abort(" - The `all_rows` argument has been removed from `epix_slide` (but - is still supported in `epi_slide`). Add rows for excluded - results with a manual join instead. - ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") - } - - if (missing(ref_time_values)) { - ref_time_values <- epix_slide_ref_time_values_default(grouped_epi_archive$private$ungrouped) - } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (any(ref_time_values > grouped_epi_archive$private$ungrouped$versions_end)) { - cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") - } - if (anyDuplicated(ref_time_values) != 0L) { - cli_abort("Some `ref_time_values` are duplicated.") - } - # Sort, for consistency with `epi_slide`, although the current - # implementation doesn't take advantage of it. - ref_time_values <- sort(ref_time_values) - } - - # Validate and pre-process `before`: - if (missing(before)) { - cli_abort("`before` is required (and must be passed by name); - if you did not want to apply a sliding window but rather - to map `as_of` and `f` across various `ref_time_values`, - pass a large `before` value (e.g., if time steps are days, - `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) - - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) - - # Symbolize column name - new_col <- sym(new_col_name) - - # Validate rest of parameters: - assert_logical(as_list_col, len = 1L) - assert_logical(all_versions, len = 1L) - assert_character(names_sep, len = 1L, null.ok = TRUE) - - # Computation for one group, one time value - comp_one_grp <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # Carry out the specified computation - comp_value <- f(.data_group, .group_key, ref_time_value, ...) - - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `as_of` step. - .data_group <- .data_group$DT - } - - assert( - check_atomic(comp_value, any.missing = TRUE), - check_data_frame(comp_value), - combine = "or", - .var.name = vname(comp_value) - ) - - # Label every result row with the `ref_time_value` - res <- list(time_value = ref_time_value) - - # Wrap the computation output in a list and unchop/unnest later if - # `as_list_col = FALSE`. This approach means that we will get a - # list-class col rather than a data.frame-class col when - # `as_list_col = TRUE` and the computations outputs are data - # frames. - res[[new_col]] <- list(comp_value) - - # Convert the list to a tibble all at once for speed. - return(validate_tibble(new_tibble(res))) - } - - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos <- enquos(...) - if (length(quos) == 0) { - cli_abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - cli_abort("If `f` is missing then only a single computation can be specified via `...`.") - } - - f <- quos[[1]] - new_col <- sym(names(rlang::quos_auto_name(quos))) - ... <- missing_arg() # magic value that passes zero args as dots in calls below - } - - f <- as_slide_computation(f, ...) - x <- lapply(ref_time_values, function(ref_time_value) { - # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, - # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw <- as_of(grouped_epi_archive$private$ungrouped, - ref_time_value, - min_time_value = ref_time_value - before, - all_versions = all_versions - ) - - # Set: - # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will - # `group_modify` as the `.data` argument. Might or might not - # include version column. - # * `group_modify_fn`, the corresponding `.f` argument - if (!all_versions) { - as_of_df <- as_of_raw - group_modify_fn <- comp_one_grp - } else { - as_of_archive <- as_of_raw - # We essentially want to `group_modify` the archive, but - # haven't implemented this method yet. Next best would be - # `group_modify` on its `$DT`, but that has different - # behavior based on whether or not `dtplyr` is loaded. - # Instead, go through an ordinary data frame, trying to avoid - # copies. - if (address(as_of_archive$DT) == address(grouped_epi_archive$private$ungrouped$DT)) { - # `as_of` aliased its the full `$DT`; copy before mutating: - # - # Note: this step is probably unneeded; we're fine with - # aliasing of the DT or its columns: vanilla operations aren't - # going to mutate them in-place if they are aliases, and we're - # not performing mutation (unlike the situation with - # `fill_through_version` where we do mutate a `DT` and don't - # want aliasing). - as_of_archive$DT <- copy(as_of_archive$DT) - } - dt_key <- data.table::key(as_of_archive$DT) - as_of_df <- as_of_archive$DT - data.table::setDF(as_of_df) - - # Convert each subgroup chunk to an archive before running the calculation. - group_modify_fn <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # .data_group is coming from as_of_df as a tibble, but we - # want to feed `comp_one_grp` an `epi_archive` backed by a - # DT; convert and wrap: - data.table::setattr(.data_group, "sorted", dt_key) - data.table::setDT(.data_group, key = dt_key) - .data_group_archive <- clone(as_of_archive) - .data_group_archive$DT <- .data_group - comp_one_grp(.data_group_archive, .group_key, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col - ) - } - } - - return( - dplyr::group_modify( - dplyr::group_by(as_of_df, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop), - group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE - ) - ) - }) - # Combine output into a single tibble - x <- as_tibble(setDF(rbindlist(x))) - # Reconstruct groups - x <- group_by(x, !!!syms(grouped_epi_archive$private$vars), .drop = grouped_epi_archive$private$drop) - - # Unchop/unnest if we need to - if (!as_list_col) { - x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) - } - - # if (is_epi_df(x)) { - # # The analogue of `epi_df`'s `as_of` metadata for an archive is - # # `$versions_end`, at least in the current absence of - # # separate fields/columns denoting the "archive version" with a - # # different resolution, or from the perspective of a different - # # stage of a data pipeline. The `as_of` that is automatically - # # derived won't always match; override: - # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end - # } - - # XXX We need to work out when we want to return an `epi_df` and how - # to get appropriate keys (see #290, #223, #163). We'll probably - # need the commented-out code above if we ever output an `epi_df`. - # However, as a stopgap measure to have some more consistency across - # different ways of calling `epix_slide`, and to prevent `epi_df` - # output with invalid metadata, always output a (grouped or - # ungrouped) tibble. - x <- decay_epi_df(x) - - return(x) -} - - -# At time of writing, roxygen parses content in collation order, impacting the -# presentation of .Rd files that document multiple functions (see -# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining -# `Collate:`) and ordering of functions within each file in order to get the -# desired ordering. - - - -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' -#' @export -is_grouped_epi_archive2 <- function(x) { - inherits(x, "grouped_epi_archive2") -} - - -#' @export -clone.grouped_epi_archive2 <- function(x, deep = FALSE) { - # TODO: Finish. - if (deep) { - ungrouped <- clone(x$private$ungrouped, deep = TRUE) - } else { - ungrouped <- x$private$ungrouped - } - new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) -} diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 02722c91..140ff9d3 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -1,3 +1,9 @@ +# At time of writing, roxygen parses content in collation order, impacting the +# presentation of .Rd files that document multiple functions (see +# https://github.com/r-lib/roxygen2/pull/324). We use @include tags (determining +# `Collate:`) below to get the desired ordering. + + #' Get var names from select-only `tidy_select`ing `...` in `.data` #' #' Convenience function for performing a `tidy_select` on dots according to its @@ -22,6 +28,7 @@ eval_pure_select_names_from_dots <- function(..., .data) { names(tidyselect::eval_select(rlang::expr(c(...)), .data, allow_rename = FALSE)) } + #' Get names of dots without forcing the dots #' #' For use in functions that use nonstandard evaluation (NSE) on the dots; we @@ -37,411 +44,388 @@ nse_dots_names2 <- function(...) { rlang::names2(rlang::call_match()) } + #' @importFrom dplyr group_by_drop_default #' @noRd -grouped_epi_archive <- - R6::R6Class( - classname = "grouped_epi_archive", - # (We don't R6-inherit `epi_archive` or S3-multiclass with "epi_archive"; - # any "inheritance" of functionality must be done via wrapper functions that - # are checked/tested for sensible operation.) - private = list( - ungrouped = NULL, - vars = NULL, - drop = NULL - ), - public = list( - initialize = function(ungrouped, vars, drop) { - if (inherits(ungrouped, "grouped_epi_archive")) { - cli_abort( - "`ungrouped` must not already be grouped (neither automatic regrouping - nor nested grouping is supported). - Either use `group_by` with `.add=TRUE`, or `ungroup` first.", - class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", - epiprocess__ungrouped_class = class(ungrouped), - epiprocess__ungrouped_groups = groups(ungrouped) - ) - } - assert_class(ungrouped, "epi_archive") - assert_character(vars) - if (!test_subset(vars, names(ungrouped$DT))) { - cli_abort( - "All grouping variables `vars` must be present in the data.", - ) - } - if ("version" %in% vars) { - cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") - } - assert_logical(drop, len = 1) - - # ----- - private$ungrouped <- ungrouped - private$vars <- vars - private$drop <- drop - }, - print = function(class = TRUE, methods = TRUE) { - if (class) cat("A `grouped_epi_archive` object:\n") - writeLines(wrap_varnames(private$vars, initial = "* Groups: ")) - # If none of the grouping vars is a factor, then $drop doesn't seem - # relevant, so try to be less verbose and don't message about it. - # - # Below map-then-extract may look weird, but the more natural - # extract-then-map appears to trigger copies of the extracted columns - # since we are working with a `data.table` (unless we go through - # `as.list`, but its current column-aliasing behavior is probably not - # something to rely too much on), while map functions currently appear - # to avoid column copies. - if (any(purrr::map_lgl(private$ungrouped$DT, is.factor)[private$vars])) { - cat(strwrap(init = "* ", prefix = " ", sprintf( - "%s groups formed by factor levels that don't appear in the data", - if (private$drop) "Drops" else "Does not drop" - ))) - cat("\n") - } - cat("It wraps an ungrouped `epi_archive`, with metadata:\n") - private$ungrouped$print(class = FALSE, methods = FALSE) - if (methods) { - cat("----------\n") - cat("Public `grouped_epi_archive` R6 methods:\n") - grouped_method_names <- names(grouped_epi_archive$public_methods) - ungrouped_method_names <- names(epi_archive$public_methods) - writeLines(wrap_varnames( - initial = "\u2022 Specialized `epi_archive` methods: ", - intersect(grouped_method_names, ungrouped_method_names) - )) - writeLines(wrap_varnames( - initial = "\u2022 Exclusive to `grouped_epi_archive`: ", - setdiff(grouped_method_names, ungrouped_method_names) - )) - writeLines(wrap_varnames( - initial = "\u2022 `ungroup` to use: ", - setdiff(ungrouped_method_names, grouped_method_names) - )) - } - # Return self invisibly for convenience in `$`-"pipe": - invisible(self) - }, - group_by = function(..., .add = FALSE, .drop = dplyr::group_by_drop_default(self)) { - assert_logical(.add, len = 1) - if (!.add) { - cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden - (neither automatic regrouping nor nested grouping is supported). - If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. - If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. - ', - class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" - ) - } else { - # `group_by` `...` computations are performed on ungrouped data (see - # `?dplyr::group_by`) - detailed_mutate <- epix_detailed_restricted_mutate(private$ungrouped, ...) - out_ungrouped <- detailed_mutate[["archive"]] - vars_from_dots <- detailed_mutate[["request_names"]] - vars <- union(private$vars, vars_from_dots) - grouped_epi_archive$new(private$ungrouped, vars, .drop) - } - }, - group_by_drop_default = function() { - private$drop - }, - groups = function() { - rlang::syms(private$vars) - }, - ungroup = function(...) { - if (rlang::dots_n(...) == 0L) { - # No dots = special behavior: remove all grouping vars and convert to - # an ungrouped class, as with `grouped_df`s. - private$ungrouped - } else { - exclude_vars <- eval_pure_select_names_from_dots(..., .data = private$ungrouped$DT) - # (requiring a pure selection here is a little stricter than dplyr - # implementations, but passing a renaming selection into `ungroup` - # seems pretty weird.) - result_vars <- private$vars[!private$vars %in% exclude_vars] - # `vars` might be length 0 if the user's tidyselection removed all - # grouping vars. Unlike with tibble, opt here to keep the result as a - # grouped_epi_archive, for output class consistency when `...` is - # provided. - grouped_epi_archive$new(private$ungrouped, result_vars, private$drop) - } - }, - #' @description Filter to keep only older versions by mutating the underlying - #' `epi_archive` using `$truncate_versions_after`. Returns the mutated - #' `grouped_epi_archive` [invisibly][base::invisible]. - #' @param x as in [`epix_truncate_versions_after`] - #' @param max_version as in [`epix_truncate_versions_after`] - truncate_versions_after = function(max_version) { - # The grouping is irrelevant for this method; if we were to split into - # groups and recombine appropriately, we should get the same result as - # just leveraging the ungrouped method, so just do the latter: - private$ungrouped$truncate_versions_after(max_version) - return(invisible(self)) - }, - #' @description Slides a given function over variables in a `grouped_epi_archive` - #' object. See the documentation for the wrapper function [`epix_slide()`] for - #' details. - #' @importFrom data.table key address rbindlist setDF - #' @importFrom tibble as_tibble new_tibble validate_tibble - #' @importFrom dplyr group_by groups - #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms - #' env missing_arg - slide = function(f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - # Perform some deprecated argument checks without using ` = - # deprecated()` in the function signature, because they are from - # early development versions and much more likely to be clutter than - # informative in the signature. - if ("group_by" %in% nse_dots_names(...)) { - cli_abort(" - The `group_by` argument to `slide` has been removed; please use - the `group_by` S3 generic function or `$group_by` R6 method - before the slide instead. (If you were instead trying to pass a - `group_by` argument to `f` or create a column named `group_by`, - this check is a false positive, but you will still need to use a - different column name here and rename the resulting column after - the slide.) - ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") - } - if ("all_rows" %in% nse_dots_names(...)) { - cli_abort(" - The `all_rows` argument has been removed from `epix_slide` (but - is still supported in `epi_slide`). Add rows for excluded - results with a manual join instead. - ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") - } - - if (missing(ref_time_values)) { - ref_time_values <- epix_slide_ref_time_values_default(private$ungrouped) - } else { - assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) - if (any(ref_time_values > private$ungrouped$versions_end)) { - cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") - } - if (anyDuplicated(ref_time_values) != 0L) { - cli_abort("Some `ref_time_values` are duplicated.") - } - # Sort, for consistency with `epi_slide`, although the current - # implementation doesn't take advantage of it. - ref_time_values <- sort(ref_time_values) - } - - # Validate and pre-process `before`: - if (missing(before)) { - cli_abort("`before` is required (and must be passed by name); - if you did not want to apply a sliding window but rather - to map `as_of` and `f` across various `ref_time_values`, - pass a large `before` value (e.g., if time steps are days, - `before=365000`).") - } - before <- vctrs::vec_cast(before, integer()) - assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) - - # If a custom time step is specified, then redefine units - - if (!missing(time_step)) before <- time_step(before) - - # Symbolize column name - new_col <- sym(new_col_name) - - # Validate rest of parameters: - assert_logical(as_list_col, len = 1L) - assert_logical(all_versions, len = 1L) - assert_character(names_sep, len = 1L, null.ok = TRUE) - - # Computation for one group, one time value - comp_one_grp <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # Carry out the specified computation - comp_value <- f(.data_group, .group_key, ref_time_value, ...) - - if (all_versions) { - # Extract data from archive so we can do length checks below. When - # `all_versions = TRUE`, `.data_group` will always be an ungrouped - # archive because of the preceding `as_of` step. - .data_group <- .data_group$DT - } - - assert( - check_atomic(comp_value, any.missing = TRUE), - check_data_frame(comp_value), - combine = "or", - .var.name = vname(comp_value) - ) - - # Label every result row with the `ref_time_value` - res <- list(time_value = ref_time_value) - - # Wrap the computation output in a list and unchop/unnest later if - # `as_list_col = FALSE`. This approach means that we will get a - # list-class col rather than a data.frame-class col when - # `as_list_col = TRUE` and the computations outputs are data - # frames. - res[[new_col]] <- list(comp_value) - - # Convert the list to a tibble all at once for speed. - return(validate_tibble(new_tibble(res))) - } - - # If `f` is missing, interpret ... as an expression for tidy evaluation - if (missing(f)) { - quos <- enquos(...) - if (length(quos) == 0) { - cli_abort("If `f` is missing then a computation must be specified via `...`.") - } - if (length(quos) > 1) { - cli_abort("If `f` is missing then only a single computation can be specified via `...`.") - } - - f <- quos[[1]] - new_col <- sym(names(rlang::quos_auto_name(quos))) - ... <- missing_arg() # magic value that passes zero args as dots in calls below - } - - f <- as_slide_computation(f, ...) - x <- lapply(ref_time_values, function(ref_time_value) { - # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, - # `epi_archive` if `all_versions` is `TRUE`: - as_of_raw <- private$ungrouped$as_of( - ref_time_value, - min_time_value = ref_time_value - before, - all_versions = all_versions - ) - - # Set: - # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will - # `group_modify` as the `.data` argument. Might or might not - # include version column. - # * `group_modify_fn`, the corresponding `.f` argument - if (!all_versions) { - as_of_df <- as_of_raw - group_modify_fn <- comp_one_grp - } else { - as_of_archive <- as_of_raw - # We essentially want to `group_modify` the archive, but - # haven't implemented this method yet. Next best would be - # `group_modify` on its `$DT`, but that has different - # behavior based on whether or not `dtplyr` is loaded. - # Instead, go through an ordinary data frame, trying to avoid - # copies. - if (address(as_of_archive$DT) == address(private$ungrouped$DT)) { - # `as_of` aliased its the full `$DT`; copy before mutating: - # - # Note: this step is probably unneeded; we're fine with - # aliasing of the DT or its columns: vanilla operations aren't - # going to mutate them in-place if they are aliases, and we're - # not performing mutation (unlike the situation with - # `fill_through_version` where we do mutate a `DT` and don't - # want aliasing). - as_of_archive$DT <- copy(as_of_archive$DT) - } - dt_key <- data.table::key(as_of_archive$DT) - as_of_df <- as_of_archive$DT - data.table::setDF(as_of_df) - - # Convert each subgroup chunk to an archive before running the calculation. - group_modify_fn <- function(.data_group, .group_key, - f, ..., - ref_time_value, - new_col) { - # .data_group is coming from as_of_df as a tibble, but we - # want to feed `comp_one_grp` an `epi_archive` backed by a - # DT; convert and wrap: - data.table::setattr(.data_group, "sorted", dt_key) - data.table::setDT(.data_group, key = dt_key) - .data_group_archive <- as_of_archive$clone() - .data_group_archive$DT <- .data_group - comp_one_grp(.data_group_archive, .group_key, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col - ) - } - } - - return( - dplyr::group_modify( - dplyr::group_by(as_of_df, !!!syms(private$vars), .drop = private$drop), - group_modify_fn, - f = f, ..., - ref_time_value = ref_time_value, - new_col = new_col, - .keep = TRUE - ) - ) - }) - # Combine output into a single tibble - x <- as_tibble(setDF(rbindlist(x))) - # Reconstruct groups - x <- group_by(x, !!!syms(private$vars), .drop = private$drop) - - # Unchop/unnest if we need to - if (!as_list_col) { - x <- tidyr::unnest(x, !!new_col, names_sep = names_sep) - } - - # nolint start: commented_code_linter. - # if (is_epi_df(x)) { - # # The analogue of `epi_df`'s `as_of` metadata for an archive is - # # `$versions_end`, at least in the current absence of - # # separate fields/columns denoting the "archive version" with a - # # different resolution, or from the perspective of a different - # # stage of a data pipeline. The `as_of` that is automatically - # # derived won't always match; override: - # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end - # } - # nolint end - - # XXX We need to work out when we want to return an `epi_df` and how - # to get appropriate keys (see #290, #223, #163). We'll probably - # need the commented-out code above if we ever output an `epi_df`. - # However, as a stopgap measure to have some more consistency across - # different ways of calling `epix_slide`, and to prevent `epi_df` - # output with invalid metadata, always output a (grouped or - # ungrouped) tibble. - x <- decay_epi_df(x) - - return(x) - } +new_grouped_epi_archive <- function(x, vars, drop) { + if (inherits(x, "grouped_epi_archive")) { + cli_abort( + "`ungrouped` must not already be grouped (neither automatic regrouping + nor nested grouping is supported). Either use `group_by` with `.add=TRUE`, + or `ungroup` first.", + class = "epiprocess__grouped_epi_archive__ungrouped_arg_is_already_grouped", + epiprocess__ungrouped_class = class(x), + epiprocess__ungrouped_groups = groups(x) + ) + } + assert_class(x, "epi_archive") + assert_character(vars) + if (!test_subset(vars, names(x$DT))) { + cli_abort( + "All grouping variables `vars` must be present in the data.", ) - ) + } + if ("version" %in% vars) { + cli_abort("`version` has a special interpretation and cannot be used by itself as a grouping variable") + } + assert_logical(drop, len = 1) + + # ----- + private <- list() + private$ungrouped <- x + private$vars <- vars + private$drop <- drop + + return(structure( + list( + private = private + ), + class = c("grouped_epi_archive", "epi_archive") + )) +} + + +#' @export +print.grouped_epi_archive <- function(x, ..., class = TRUE) { + if (rlang::dots_n(...) > 0) { + cli_abort(c( + "Error in print.grouped_epi_archive()", + "i" = "Too many arguments passed to `print.grouped_epi_archive()`." + )) + } + + if (class) cat("A `grouped_epi_archive` object:\n") + writeLines(wrap_varnames(x$private$vars, initial = "* Groups: ")) + # If none of the grouping vars is a factor, then $drop doesn't seem + # relevant, so try to be less verbose and don't message about it. + # + # Below map-then-extract may look weird, but the more natural + # extract-then-map appears to trigger copies of the extracted columns + # since we are working with a `data.table` (unless we go through + # `as.list`, but its current column-aliasing behavior is probably not + # something to rely too much on), while map functions currently appear + # to avoid column copies. + if (any(purrr::map_lgl(x$private$ungrouped$DT, is.factor)[x$private$vars])) { + cat(strwrap(initial = "* ", prefix = " ", sprintf( + "%s groups formed by factor levels that don't appear in the data", + if (x$private$drop) "Drops" else "Does not drop" + ))) + cat("\n") + } + cat("It wraps an ungrouped `epi_archive`, with metadata:\n") + print(x$private$ungrouped, class = FALSE) + # Return self invisibly for convenience in `$`-"pipe": + invisible(x) +} -# At time of writing, roxygen parses content in collation order, impacting the -# presentation of .Rd files that document multiple functions (see -# https://github.com/r-lib/roxygen2/pull/324). Use @include tags (determining -# `Collate:`) and ordering of functions within each file in order to get the -# desired ordering. #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' #' @importFrom dplyr group_by #' @export -group_by.grouped_epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { - .data$group_by(..., .add = .add, .drop = .drop) +group_by.grouped_epi_archive <- function( + .data, + ..., + .add = FALSE, + .drop = dplyr::group_by_drop_default(.data)) { + assert_logical(.add, len = 1) + if (!.add) { + cli_abort('`group_by` on a `grouped_epi_archive` with `.add=FALSE` is forbidden + (neither automatic regrouping nor nested grouping is supported). + If you want to "regroup", replacing the existing grouping vars, `ungroup` first and then `group_by`. + If you want to add to the existing grouping vars, call `group_by` specifying `.add=TRUE`. + ', + class = "epiprocess__grouped_epi_archive_group_by_with_add_FALSE" + ) + } else { + # `group_by` `...` computations are performed on ungrouped data (see + # `?dplyr::group_by`) + detailed_mutate <- epix_detailed_restricted_mutate(.data$private$ungrouped, ...) + out_ungrouped <- detailed_mutate[["archive"]] + vars_from_dots <- detailed_mutate[["request_names"]] + vars <- union(.data$private$vars, vars_from_dots) + new_grouped_epi_archive(out_ungrouped, vars, .drop) + } } + +#' @include methods-epi_archive.R +#' @rdname group_by.epi_archive +#' +#' @param .tbl A `grouped_epi_archive` object. +#' +#' @export +group_by_drop_default.grouped_epi_archive <- function(.tbl) { + x <- .tbl + x$private$drop +} + + #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' #' @importFrom dplyr groups #' @export groups.grouped_epi_archive <- function(x) { - x$groups() + rlang::syms(x$private$vars) } + #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' #' @importFrom dplyr ungroup #' @export ungroup.grouped_epi_archive <- function(x, ...) { - x$ungroup(...) + if (rlang::dots_n(...) == 0L) { + # No dots = special behavior: remove all grouping vars and convert to + # an ungrouped class, as with `grouped_df`s. + x$private$ungrouped + } else { + exclude_vars <- eval_pure_select_names_from_dots(..., .data = x$private$ungrouped$DT) + # (requiring a pure selection here is a little stricter than dplyr + # implementations, but passing a renaming selection into `ungroup` + # seems pretty weird.) + result_vars <- x$private$vars[!x$private$vars %in% exclude_vars] + # `vars` might be length 0 if the user's tidyselection removed all + # grouping vars. Unlike with tibble, opt here to keep the result as a + # grouped_epi_archive, for output class consistency when `...` is + # provided. + new_grouped_epi_archive(x$private$ungrouped, result_vars, x$private$drop) + } } + +#' @rdname epix_slide +#' +#' @importFrom data.table key address rbindlist setDF copy +#' @importFrom tibble as_tibble new_tibble validate_tibble +#' @importFrom dplyr group_by groups +#' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms +#' env missing_arg +#' @export +epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # Perform some deprecated argument checks without using ` = + # deprecated()` in the function signature, because they are from + # early development versions and much more likely to be clutter than + # informative in the signature. + if ("group_by" %in% nse_dots_names(...)) { + cli_abort(" + The `group_by` argument to `slide` has been removed; please use + the `group_by` S3 generic function or `$group_by` R6 method + before the slide instead. (If you were instead trying to pass a + `group_by` argument to `f` or create a column named `group_by`, + this check is a false positive, but you will still need to use a + different column name here and rename the resulting column after + the slide.) + ", class = "epiprocess__epix_slide_group_by_parameter_deprecated") + } + if ("all_rows" %in% nse_dots_names(...)) { + cli_abort(" + The `all_rows` argument has been removed from `epix_slide` (but + is still supported in `epi_slide`). Add rows for excluded + results with a manual join instead. + ", class = "epiprocess__epix_slide_all_rows_parameter_deprecated") + } + + if (missing(ref_time_values)) { + ref_time_values <- epix_slide_ref_time_values_default(x$private$ungrouped) + } else { + assert_numeric(ref_time_values, min.len = 1L, null.ok = FALSE, any.missing = FALSE) + if (any(ref_time_values > x$private$ungrouped$versions_end)) { + cli_abort("Some `ref_time_values` are greater than the latest version in the archive.") + } + if (anyDuplicated(ref_time_values) != 0L) { + cli_abort("Some `ref_time_values` are duplicated.") + } + # Sort, for consistency with `epi_slide`, although the current + # implementation doesn't take advantage of it. + ref_time_values <- sort(ref_time_values) + } + + # Validate and pre-process `before`: + if (missing(before)) { + cli_abort("`before` is required (and must be passed by name); + if you did not want to apply a sliding window but rather + to map `epix_as_of` and `f` across various `ref_time_values`, + pass a large `before` value (e.g., if time steps are days, + `before=365000`).") + } + before <- vctrs::vec_cast(before, integer()) + assert_int(before, lower = 0L, null.ok = FALSE, na.ok = FALSE) + + # If a custom time step is specified, then redefine units + + if (!missing(time_step)) before <- time_step(before) + + # Symbolize column name + new_col <- sym(new_col_name) + + # Validate rest of parameters: + assert_logical(as_list_col, len = 1L) + assert_logical(all_versions, len = 1L) + assert_character(names_sep, len = 1L, null.ok = TRUE) + + # Computation for one group, one time value + comp_one_grp <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # Carry out the specified computation + comp_value <- f(.data_group, .group_key, ref_time_value, ...) + + if (all_versions) { + # Extract data from archive so we can do length checks below. When + # `all_versions = TRUE`, `.data_group` will always be an ungrouped + # archive because of the preceding `epix_as_of` step. + .data_group <- .data_group$DT + } + + assert( + check_atomic(comp_value, any.missing = TRUE), + check_data_frame(comp_value), + combine = "or", + .var.name = vname(comp_value) + ) + + # Label every result row with the `ref_time_value` + res <- list(time_value = ref_time_value) + + # Wrap the computation output in a list and unchop/unnest later if + # `as_list_col = FALSE`. This approach means that we will get a + # list-class col rather than a data.frame-class col when + # `as_list_col = TRUE` and the computations outputs are data + # frames. + res[[new_col]] <- list(comp_value) + + # Convert the list to a tibble all at once for speed. + return(validate_tibble(new_tibble(res))) + } + + # If `f` is missing, interpret ... as an expression for tidy evaluation + if (missing(f)) { + quos <- enquos(...) + if (length(quos) == 0) { + cli_abort("If `f` is missing then a computation must be specified via `...`.") + } + if (length(quos) > 1) { + cli_abort("If `f` is missing then only a single computation can be specified via `...`.") + } + + f <- quos[[1]] + new_col <- sym(names(rlang::quos_auto_name(quos))) + ... <- missing_arg() # nolint: object_usage_linter. magic value that passes zero args as dots in calls below + } + + f <- as_slide_computation(f, ...) + out <- lapply(ref_time_values, function(ref_time_value) { + # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, + # `epi_archive` if `all_versions` is `TRUE`: + as_of_raw <- x$private$ungrouped %>% epix_as_of( + ref_time_value, + min_time_value = ref_time_value - before, + all_versions = all_versions + ) + + # Set: + # * `as_of_df`, the data.frame/tibble/epi_df/etc. that we will + # `group_modify` as the `.data` argument. Might or might not + # include version column. + # * `group_modify_fn`, the corresponding `.f` argument + if (!all_versions) { + as_of_df <- as_of_raw + group_modify_fn <- comp_one_grp + } else { + as_of_archive <- as_of_raw + # We essentially want to `group_modify` the archive, but + # haven't implemented this method yet. Next best would be + # `group_modify` on its `$DT`, but that has different + # behavior based on whether or not `dtplyr` is loaded. + # Instead, go through an ordinary data frame, trying to avoid + # copies. + if (address(as_of_archive$DT) == address(x$private$ungrouped$DT)) { + # `as_of` aliased its the full `$DT`; copy before mutating: + # + # Note: this step is probably unneeded; we're fine with + # aliasing of the DT or its columns: vanilla operations aren't + # going to mutate them in-place if they are aliases, and we're + # not performing mutation. + as_of_archive$DT <- data.table::copy(as_of_archive$DT) + } + dt_key <- data.table::key(as_of_archive$DT) + as_of_df <- as_of_archive$DT + data.table::setDF(as_of_df) + + # Convert each subgroup chunk to an archive before running the calculation. + group_modify_fn <- function(.data_group, .group_key, + f, ..., + ref_time_value, + new_col) { + # .data_group is coming from as_of_df as a tibble, but we + # want to feed `comp_one_grp` an `epi_archive` backed by a + # DT; convert and wrap: + data.table::setattr(.data_group, "sorted", dt_key) + data.table::setDT(.data_group, key = dt_key) + .data_group_archive <- as_of_archive %>% clone() + .data_group_archive$DT <- .data_group + comp_one_grp(.data_group_archive, .group_key, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col + ) + } + } + + return( + dplyr::group_modify( + dplyr::group_by(as_of_df, !!!syms(x$private$vars), .drop = x$private$drop), + group_modify_fn, + f = f, ..., + ref_time_value = ref_time_value, + new_col = new_col, + .keep = TRUE + ) + ) + }) + # Combine output into a single tibble + out <- as_tibble(setDF(rbindlist(out))) + # Reconstruct groups + out <- group_by(out, !!!syms(x$private$vars), .drop = x$private$drop) + + # Unchop/unnest if we need to + if (!as_list_col) { + out <- tidyr::unnest(out, !!new_col, names_sep = names_sep) + } + + # nolint start: commented_code_linter. + # if (is_epi_df(x)) { + # # The analogue of `epi_df`'s `as_of` metadata for an archive is + # # `$versions_end`, at least in the current absence of + # # separate fields/columns denoting the "archive version" with a + # # different resolution, or from the perspective of a different + # # stage of a data pipeline. The `as_of` that is automatically + # # derived won't always match; override: + # attr(x, "metadata")[["as_of"]] <- private$ungrouped$versions_end + # } + # nolint end + + # XXX We need to work out when we want to return an `epi_df` and how + # to get appropriate keys (see #290, #223, #163). We'll probably + # need the commented-out code above if we ever output an `epi_df`. + # However, as a stopgap measure to have some more consistency across + # different ways of calling `epix_slide`, and to prevent `epi_df` + # output with invalid metadata, always output a (grouped or + # ungrouped) tibble. + out <- decay_epi_df(out) + + return(out) +} + + #' @include methods-epi_archive.R #' @rdname group_by.epi_archive #' @@ -450,16 +434,20 @@ is_grouped_epi_archive <- function(x) { inherits(x, "grouped_epi_archive") } -#' @include methods-epi_archive.R -#' @rdname group_by.epi_archive -#' + #' @export -group_by_drop_default.grouped_epi_archive <- function(.tbl) { - .tbl$group_by_drop_default() +clone.grouped_epi_archive <- function(x, ...) { + ungrouped <- x$private$ungrouped %>% clone() + new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) } + +#' @rdname epix_truncate_versions_after #' @export epix_truncate_versions_after.grouped_epi_archive <- function(x, max_version) { - return((x$clone()$truncate_versions_after(max_version))) - # ^ second set of parens drops invisibility + # The grouping is irrelevant for this method; if we were to split into + # groups and recombine appropriately, we should get the same result as + # just leveraging the ungrouped method, so just do the latter: + x$private$ungrouped <- epix_truncate_versions_after(x$private$ungrouped, max_version) + x } diff --git a/R/growth_rate.R b/R/growth_rate.R index a60db452..f2b326a1 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -249,7 +249,9 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, # Estimate growth rate and return f <- genlasso::coef.genlasso(obj, df = df)$beta - d <- extend_r(diff(f) / diff(x)) + d <- diff(f) / diff(x) + # Extend by one element + d <- c(d, d[length(d)]) if (log_scale) { return(d[i0]) } else { diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 213cf1b1..f6846488 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -22,36 +22,17 @@ #' for the `max_version` of each `time_value`. Default is `FALSE`. #' @return An `epi_df` object. #' -#' @details This is simply a wrapper around the `as_of()` method of the -#' `epi_archive` class, so if `x` is an `epi_archive` object, then: -#' ``` -#' epix_as_of(x, max_version = v) -#' ``` -#' is equivalent to: -#' ``` -#' x$as_of(max_version = v) -#' ``` -#' -#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input -#' archives, but may in some edge cases alias parts of the inputs, so copy the -#' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, the only situation where there is potentially aliasing -#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change -#' in the future. -#' #' @examples #' # warning message of data latency shown #' epix_as_of( -#' x = archive_cases_dv_subset, +#' archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version) #' ) #' -#' @examples -#' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 #' #' epix_as_of( -#' x = archive_cases_dv_subset, +#' archive_cases_dv_subset, #' max_version = as.Date("2020-06-12") #' ) #' @@ -66,7 +47,7 @@ #' withCallingHandlers( #' { #' epix_as_of( -#' x = archive_cases_dv_subset, +#' archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version) #' ) #' }, @@ -75,14 +56,77 @@ #' # Since R 4.0, there is a `globalCallingHandlers` function that can be used #' # to globally toggle these warnings. #' +#' @importFrom data.table between key #' @export epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FALSE) { assert_class(x, "epi_archive") - return(x$as_of(max_version, min_time_value, all_versions = all_versions)) + + other_keys <- setdiff( + key(x$DT), + c("geo_value", "time_value", "version") + ) + if (length(other_keys) == 0) other_keys <- NULL + + # Check a few things on max_version + if (!test_set_equal(class(max_version), class(x$DT$version))) { + cli_abort( + "`max_version` must have the same classes as `epi_archive$DT$version`." + ) + } + if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { + cli_abort( + "`max_version` must have the same types as `epi_archive$DT$version`." + ) + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > x$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + assert_logical(all_versions, len = 1) + if (!is.na(x$clobberable_versions_start) && max_version >= x$clobberable_versions_start) { + cli_warn( + 'Getting data as of some recent version which could still be + overwritten (under routine circumstances) without assigning a new + version number (a.k.a. "clobbered"). Thus, the snapshot that we + produce here should not be expected to be reproducible later. See + `?epi_archive` for more info and `?epix_as_of` on how to muffle.', + class = "epiprocess__snapshot_as_of_clobberable_version" + ) + } + + # Filter by version and return + if (all_versions) { + # epi_archive is copied into result, so we can modify result directly + result <- epix_truncate_versions_after(x, max_version) + result$DT <- result$DT[time_value >= min_time_value, ] # nolint: object_usage_linter + return(result) + } + + # Make sure to use data.table ways of filtering and selecting + as_of_epi_df <- x$DT[time_value >= min_time_value & version <= max_version, ] %>% # nolint: object_usage_linter + unique( + by = c("geo_value", "time_value", other_keys), + fromLast = TRUE + ) %>% + tibble::as_tibble() %>% + dplyr::select(-"version") %>% + as_epi_df( + geo_type = x$geo_type, + time_type = x$time_type, + as_of = max_version, + additional_metadata = c( + x$additional_metadata, + list(other_keys = other_keys) + ) + ) + + return(as_of_epi_df) } -#' `epi_archive` with unobserved history filled in (won't mutate, might alias) + +#' Fill `epi_archive` unobserved history #' +#' @description #' Sometimes, due to upstream data pipeline issues, we have to work with a #' version history that isn't completely up to date, but with functions that #' expect archives that are completely up to date, or equally as up-to-date as @@ -90,13 +134,6 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL #' pretend that we've "observed" additional versions, filling in these versions #' with NAs or extrapolated values. #' -#' '`epix_fill_through_version` will not mutate its `x` argument, but its result -#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate -#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to -#' give the result, but might reseat its fields (e.g., references to the old -#' `x$DT` might not be updated by this function or subsequent operations on -#' `x`), and returns the updated `x` [invisibly][base::invisible]. -#' #' @param x An `epi_archive` #' @param fill_versions_end Length-1, same class&type as `x$version`: the #' version through which to fill in missing version history; this will be the @@ -110,31 +147,79 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL #' version history with the last version of each observation carried forward #' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are #' based on LOCF). Default is `"na"`. +#' +#' @importFrom data.table copy ":=" +#' @importFrom rlang arg_match #' @return An `epi_archive` +#' @export epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf")) { assert_class(x, "epi_archive") - # Enclosing parentheses drop the invisibility flag. See description above of - # potential mutation and aliasing behavior. - (x$clone()$fill_through_version(fill_versions_end, how = how)) + + validate_version_bound(fill_versions_end, x$DT, na_ok = FALSE) + how <- arg_match(how) + if (x$versions_end < fill_versions_end) { + new_dt <- switch(how, + "na" = { + # old DT + a version consisting of all NA observations + # immediately after the last currently/actually-observed + # version. Note that this NA-observation version must only be + # added if `epi_archive` is outdated. + nonversion_key_cols <- setdiff(key(x$DT), "version") + nonkey_cols <- setdiff(names(x$DT), key(x$DT)) + next_version_tag <- next_after(x$versions_end) + if (next_version_tag > fill_versions_end) { + cli_abort(sprintf(paste( + "Apparent problem with `next_after` method:", + "archive contained observations through version %s", + "and the next possible version was supposed to be %s,", + "but this appeared to jump from a version < %3$s", + "to one > %3$s, implying at least one version in between." + ), x$versions_end, next_version_tag, fill_versions_end)) + } + nonversion_key_vals_ever_recorded <- unique(x$DT, by = nonversion_key_cols) + # In edge cases, the `unique` result can alias the original + # DT; detect and copy if necessary: + if (identical(address(x$DT), address(nonversion_key_vals_ever_recorded))) { + nonversion_key_vals_ever_recorded <- data.table::copy(nonversion_key_vals_ever_recorded) + } + next_version_dt <- nonversion_key_vals_ever_recorded[ + , version := next_version_tag + ][ + # this makes the class of these columns logical (`NA` is a + # logical NA; we're relying on the rbind below to convert to + # the proper class&typeof) + , (nonkey_cols) := NA + ] + # full result DT: + setkeyv(rbind(x$DT, next_version_dt), key(x$DT))[] + }, + "locf" = { + # just the old DT; LOCF is built into other methods: + x$DT + } + ) + new_versions_end <- fill_versions_end + # Update `epi_archive` all at once with simple, error-free operations + + # return below: + x$DT <- new_dt + x$versions_end <- new_versions_end + } else { + # Already sufficiently up to date; nothing to do. + } + return(x) } + #' Merge two `epi_archive` objects #' #' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and -#' set of key columns. When they also share a common `versions_end`, -#' using `$as_of` on the result should be the same as using `$as_of` on `x` and -#' `y` individually, then performing a full join of the `DT`s on the non-version -#' key columns (potentially consolidating multiple warnings about clobberable -#' versions). If the `versions_end` values differ, the -#' `sync` parameter controls what is done. -#' -#' This function, [`epix_merge`], does not mutate its inputs and will not alias -#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite -#' `x` with the result of the merge, reseating its `DT` and several other fields -#' (making them point to different objects), but avoiding mutation of the -#' contents of the old `DT` (only relevant if you have another reference to the -#' old `DT` in another object). +#' set of key columns. When they also share a common `versions_end`, using +#' `epix_as_of` on the result should be the same as using `epix_as_of` on `x` +#' and `y` individually, then performing a full join of the `DT`s on the +#' non-version key columns (potentially consolidating multiple warnings about +#' clobberable versions). If the `versions_end` values differ, the `sync` +#' parameter controls what is done. #' #' @param x,y Two `epi_archive` objects to join together. #' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the @@ -153,7 +238,7 @@ epix_fill_through_version <- function(x, fill_versions_end, #' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, #' and discard any rows containing update rows for later versions. #' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be -#' compactified? See [`as_epi_archive`] for an explanation of what this means. +#' compactified? See `as_epi_archive()` for an explanation of what this means. #' Default here is `TRUE`. #' @return the resulting `epi_archive` #' @@ -171,8 +256,6 @@ epix_fill_through_version <- function(x, fill_versions_end, #' as_epi_archive(compactify = TRUE) #' # merge results stored in a third object: #' xy <- epix_merge(x, y) -#' # vs. mutating x to hold the merge result: -#' x$merge(y) #' #' @importFrom data.table key set setkeyv #' @export @@ -207,13 +290,9 @@ epix_merge <- function(x, y, if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { NA # (any type of NA is fine here) } else { - min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) + min(c(x$clobberable_versions_start, y$clobberable_versions_start), na.rm = TRUE) } - # The actual merge below may not succeed 100% of the time, so do this - # preprocessing using non-mutating (but potentially aliasing) functions. This - # approach potentially uses more memory, but won't leave behind a - # partially-mutated `x` on failure. if (sync == "forbid") { if (!identical(x$versions_end, y$versions_end)) { cli_abort(paste( @@ -386,64 +465,6 @@ epix_merge <- function(x, y, )) } -# Helpers for `group_by`: - -#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input -#' -#' A workaround for `dplyr:::mutate_cols` not being exported and directly -#' applying test mock libraries likely being impossible (due to mocking another -#' package's S3 generic or method). -#' -#' Use solely with a single call to the [`dplyr::mutate`] function and then -#' `destructure_col_modify_recorder_df`; other applicable operations from -#' [dplyr::dplyr_extending] have not been implemented. -#' -#' @param parent_df the "parent class" data frame to wrap -#' @return a `col_modify_recorder_df` -#' -#' @noRd -new_col_modify_recorder_df <- function(parent_df) { - assert_class(parent_df, "data.frame") - `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) -} - -#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` -#' -#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` -#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the -#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record -#' -#' @noRd -destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { - assert_class(col_modify_recorder_df, "col_modify_recorder_df") - list( - unchanged_parent_df = col_modify_recorder_df %>% - `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% - `class<-`(setdiff(class(.data), "col_modify_recorder_df")), - cols = attr(col_modify_recorder_df, - "epiprocess::col_modify_recorder_df::cols", - exact = TRUE - ) - ) -} - -#' `dplyr_col_modify` method that simply records the `cols` argument -#' -#' Must export S3 methods in R >= 4.0, even if they're only designed to be -#' package internals, and must import any corresponding upstream S3 generic -#' functions: -#' @importFrom dplyr dplyr_col_modify -#' @export -#' @noRd -dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { - if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { - cli_abort("`col_modify_recorder_df` can only record `cols` once", - internal = TRUE - ) - } - attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols - data -} #' A more detailed but restricted `mutate` for use in `group_by.epi_archive` #' @@ -512,7 +533,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { out_dt <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% data.table::setattr("sorted", data.table::key(.data$DT)) %>% data.table::setDT(key = key(.data$DT)) - out_archive <- .data$clone() + out_archive <- .data %>% clone() out_archive$DT <- out_dt request_names <- names(col_modify_cols) return(list( @@ -532,163 +553,6 @@ epix_detailed_restricted_mutate <- function(.data, ...) { } } -#' `group_by` and related methods for `epi_archive`, `grouped_epi_archive` -#' -#' @param .data An `epi_archive` or `grouped_epi_archive` -#' @param ... Similar to [`dplyr::group_by`] (see "Details:" for edge cases); -#' * For `group_by`: unquoted variable name(s) or other -#' ["data masking"][dplyr::dplyr_data_masking] expression(s). It's possible to -#' use [`dplyr::mutate`]-like syntax here to calculate new columns on which to -#' perform grouping, but note that, if you are regrouping an already-grouped -#' `.data` object, the calculations will be carried out ignoring such grouping -#' (same as [in dplyr][dplyr::group_by]). -#' * For `ungroup`: either -#' * empty, in order to remove the grouping and output an `epi_archive`; or -#' * variable name(s) or other ["tidy-select"][dplyr::dplyr_tidy_select] -#' expression(s), in order to remove the matching variables from the list of -#' grouping variables, and output another `grouped_epi_archive`. -#' @param .add Boolean. If `FALSE`, the default, the output will be grouped by -#' the variable selection from `...` only; if `TRUE`, the output will be -#' grouped by the current grouping variables plus the variable selection from -#' `...`. -#' @param .drop As described in [`dplyr::group_by`]; determines treatment of -#' factor columns. -#' @param x For `groups` or `ungroup`: a `grouped_epi_archive`; for -#' `is_grouped_epi_archive`: any object -#' @param .tbl (For `group_by_drop_default`:) an `epi_archive` or -#' `grouped_epi_archive` (`epi_archive` dispatches to the S3 default method; -#' `grouped_epi_archive` dispatches its own S3 method) -#' -#' @details -#' -#' To match `dplyr`, `group_by` allows "data masking" (also referred to as -#' "tidy evaluation") expressions `...`, not just column names, in a way similar -#' to `mutate`. Note that replacing or removing key columns with these -#' expressions is disabled. -#' -#' `archive %>% group_by()` and other expressions that group or regroup by zero -#' columns (indicating that all rows should be treated as part of one large -#' group) will output a `grouped_epi_archive`, in order to enable the use of -#' `grouped_epi_archive` methods on the result. This is in slight contrast to -#' the same operations on tibbles and grouped tibbles, which will *not* output a -#' `grouped_df` in these circumstances. -#' -#' Using `group_by` with `.add=FALSE` to override the existing grouping is -#' disabled; instead, `ungroup` first then `group_by`. -#' -#' Mutation and aliasing: `group_by` tries to use a shallow copy of the `DT`, -#' introducing column-level aliasing between its input and its result. This -#' doesn't follow the general model for most `data.table` operations, which -#' seems to be that, given an nonaliased (i.e., unique) pointer to a -#' `data.table` object, its pointers to its columns should also be nonaliased. -#' If you mutate any of the columns of either the input or result, first ensure -#' that it is fine if columns of the other are also mutated, but do not rely on -#' such behavior to occur. Additionally, never perform mutation on the key -#' columns at all (except for strictly increasing transformations), as this will -#' invalidate sortedness assumptions about the rows. -#' -#' `group_by_drop_default` on (ungrouped) `epi_archive`s is expected to dispatch -#' to `group_by_drop_default.default` (but there is a dedicated method for -#' `grouped_epi_archive`s). -#' -#' @examples -#' -#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) -#' -#' # `print` for metadata and method listing: -#' grouped_archive %>% print() -#' -#' # The primary use for grouping is to perform a grouped `epix_slide`: -#' -#' archive_cases_dv_subset %>% -#' group_by(geo_value) %>% -#' epix_slide( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = as.Date("2020-06-11") + 0:2, -#' new_col_name = "case_rate_3d_av" -#' ) %>% -#' ungroup() -#' -#' # ----------------------------------------------------------------- -#' -#' # Advanced: some other features of dplyr grouping are implemented: -#' -#' library(dplyr) -#' toy_archive <- -#' tribble( -#' ~geo_value, ~age_group, ~time_value, ~version, ~value, -#' "us", "adult", "2000-01-01", "2000-01-02", 121, -#' "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) -#' "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) -#' "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) -#' ) %>% -#' mutate( -#' age_group = ordered(age_group, c("pediatric", "adult")), -#' time_value = as.Date(time_value), -#' version = as.Date(version) -#' ) %>% -#' as_epi_archive(other_keys = "age_group") -#' -#' # The following are equivalent: -#' toy_archive %>% group_by(geo_value, age_group) -#' toy_archive %>% -#' group_by(geo_value) %>% -#' group_by(age_group, .add = TRUE) -#' grouping_cols <- c("geo_value", "age_group") -#' toy_archive %>% group_by(across(all_of(grouping_cols))) -#' -#' # And these are equivalent: -#' toy_archive %>% group_by(geo_value) -#' toy_archive %>% -#' group_by(geo_value, age_group) %>% -#' ungroup(age_group) -#' -#' # To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -#' toy_archive %>% -#' group_by(geo_value) %>% -#' groups() -#' -#' toy_archive %>% -#' group_by(geo_value, age_group, .drop = FALSE) %>% -#' epix_slide(f = ~ sum(.x$value), before = 20) %>% -#' ungroup() -#' -#' @importFrom dplyr group_by -#' @export -#' -#' @aliases grouped_epi_archive -group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) { - # `add` makes no difference; this is an ungrouped `epi_archive`. - detailed_mutate <- epix_detailed_restricted_mutate(.data, ...) - assert_logical(.drop) - if (!.drop) { - grouping_cols <- as.list(detailed_mutate[["archive"]][["DT"]])[detailed_mutate[["request_names"]]] - grouping_col_is_factor <- purrr::map_lgl(grouping_cols, is.factor) - # ^ Use `as.list` to try to avoid any possibility of a deep copy. - if (!any(grouping_col_is_factor)) { - cli_warn( - "`.drop=FALSE` but there are no factor grouping columns; - did you mean to convert one of the columns to a factor beforehand?", - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) - } else if (any(diff(grouping_col_is_factor) == -1L)) { - cli_warn( - "`.drop=FALSE` but there are one or more non-factor grouping columns listed - after a factor grouping column; this may produce groups with `NA`s for these columns; - see https://github.com/tidyverse/dplyr/issues/5369#issuecomment-683762553; - depending on how you want completion to work, you might instead want to convert - all grouping columns to factors beforehand, specify the non-factor grouping columns - first, or use `.drop=TRUE` and add a call to `tidyr::complete`.", - class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" - ) - } - } - grouped_epi_archive$new(detailed_mutate[["archive"]], - detailed_mutate[["request_names"]], - drop = .drop - ) -} #' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` #' @@ -742,8 +606,8 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ #' @param ref_time_values Reference time values / versions for sliding #' computations; each element of this vector serves both as the anchor point #' for the `time_value` window for the computation and the `max_version` -#' `as_of` which we fetch data in this window. If missing, then this will set -#' to a regularly-spaced sequence of values set to cover the range of +#' `epix_as_of` which we fetch data in this window. If missing, then this will +#' set to a regularly-spaced sequence of values set to cover the range of #' `version`s in the `DT` plus the `versions_end`; the spacing of values will #' be guessed (using the GCD of the skips between values). #' @param time_step Optional function used to define the meaning of one time @@ -823,30 +687,11 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ #' #' Furthermore, the current function can be considerably slower than #' `epi_slide()`, for two reasons: (1) it must repeatedly fetch -#' properly-versioned snapshots from the data archive (via its `as_of()` -#' method), and (2) it performs a "manual" sliding of sorts, and does not -#' benefit from the highly efficient `slider` package. For this reason, it -#' should never be used in place of `epi_slide()`, and only used when -#' version-aware sliding is necessary (as it its purpose). -#' -#' Finally, this is simply a wrapper around the `slide()` method of the -#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an -#' object of either of these classes, then: -#' ``` -#' epix_slide(x, new_var = comp(old_var), before = 119) -#' ``` -#' is equivalent to: -#' ``` -#' x$slide(new_var = comp(old_var), before = 119) -#' ``` -#' -#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place -#' mutation of the input archives on their own. In some edge cases the inputs it -#' feeds to the slide computations may alias parts of the input archive, so copy -#' the slide computation inputs if needed before using mutating operations like -#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of -#' the slide operation may alias parts of the input archive, so similarly, make -#' sure to clone and/or copy appropriately before using in-place mutation. +#' properly-versioned snapshots from the data archive (via `epix_as_of()`), +#' and (2) it performs a "manual" sliding of sorts, and does not benefit from +#' the highly efficient `slider` package. For this reason, it should never be +#' used in place of `epi_slide()`, and only used when version-aware sliding is +#' necessary (as it its purpose). #' #' @examples #' library(dplyr) @@ -940,26 +785,52 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ #' filter(geo_value == "ca") %>% #' select(-geo_value) #' -#' @importFrom rlang enquo !!! #' @export -epix_slide <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { +epix_slide <- function( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE) { if (!is_epi_archive(x, grouped_okay = TRUE)) { cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") } - return(x$slide(f, ..., - before = before, - ref_time_values = ref_time_values, - time_step = time_step, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, + UseMethod("epix_slide") +} + + +#' @rdname epix_slide +#' @export +epix_slide.epi_archive <- function(x, f, ..., before, ref_time_values, + time_step, new_col_name = "slide_value", + as_list_col = FALSE, names_sep = "_", + all_versions = FALSE) { + # For an "ungrouped" slide, treat all rows as belonging to one big + # group (group by 0 vars), like `dplyr::summarize`, and let the + # resulting `grouped_epi_archive` handle the slide: + epix_slide( + group_by(x), + f, + ..., + before = before, ref_time_values = ref_time_values, + time_step = time_step, new_col_name = new_col_name, + as_list_col = as_list_col, names_sep = names_sep, all_versions = all_versions - )) + ) %>% + # We want a slide on ungrouped archives to output something + # ungrouped, rather than retaining the trivial (0-variable) + # grouping applied above. So we `ungroup()`. However, the current + # `dplyr` implementation automatically ignores/drops trivial + # groupings, so this is just a no-op for now. + ungroup() } + #' Default value for `ref_time_values` in an `epix_slide` #' #' @noRd @@ -969,16 +840,14 @@ epix_slide_ref_time_values_default <- function(ea) { return(ref_time_values) } + #' Filter an `epi_archive` object to keep only older versions #' #' Generates a filtered `epi_archive` from an `epi_archive` object, keeping #' only rows with `version` falling on or before a specified date. #' -#' @param x An `epi_archive` object -#' @param max_version Time value specifying the max version to permit in the -#' filtered archive. That is, the output archive will comprise rows of the -#' current archive data having `version` less than or equal to the -#' specified `max_version` +#' @param x An `epi_archive` object. +#' @param max_version The latest version to include in the archive. #' @return An `epi_archive` object #' #' @export @@ -986,8 +855,89 @@ epix_truncate_versions_after <- function(x, max_version) { UseMethod("epix_truncate_versions_after") } + +#' @rdname epix_truncate_versions_after #' @export epix_truncate_versions_after.epi_archive <- function(x, max_version) { - return((x$clone()$truncate_versions_after(max_version))) - # ^ second set of parens drops invisibility + if (!test_set_equal(class(max_version), class(x$DT$version))) { + cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") + } + if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { + cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") + } + assert_scalar(max_version, na.ok = FALSE) + if (max_version > x$versions_end) { + cli_abort("`max_version` must be at most `epi_archive$versions_end`.") + } + x$DT <- x$DT[x$DT$version <= max_version, colnames(x$DT), with = FALSE] + # (^ this filter operation seems to always copy the DT, even if it + # keeps every entry; we don't guarantee this behavior in + # documentation, though, so we could change to alias in this case) + if (!is.na(x$clobberable_versions_start) && x$clobberable_versions_start > max_version) { + x$clobberable_versions_start <- NA + } + x$versions_end <- max_version + return(x) +} + + +# Helpers for `group_by`: + +#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input +#' +#' A workaround for `dplyr:::mutate_cols` not being exported and directly +#' applying test mock libraries likely being impossible (due to mocking another +#' package's S3 generic or method). +#' +#' Use solely with a single call to the [`dplyr::mutate`] function and then +#' `destructure_col_modify_recorder_df`; other applicable operations from +#' [dplyr::dplyr_extending] have not been implemented. +#' +#' @param parent_df the "parent class" data frame to wrap +#' @return a `col_modify_recorder_df` +#' +#' @noRd +new_col_modify_recorder_df <- function(parent_df) { + assert_class(parent_df, "data.frame") + `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) +} + + +#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` +#' +#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` +#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the +#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record +#' +#' @noRd +destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { + assert_class(col_modify_recorder_df, "col_modify_recorder_df") + list( + unchanged_parent_df = col_modify_recorder_df %>% + `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% + `class<-`(setdiff(class(.data), "col_modify_recorder_df")), + cols = attr(col_modify_recorder_df, + "epiprocess::col_modify_recorder_df::cols", + exact = TRUE + ) + ) +} + + +#' `dplyr_col_modify` method that simply records the `cols` argument +#' +#' Must export S3 methods in R >= 4.0, even if they're only designed to be +#' package internals, and must import any corresponding upstream S3 generic +#' functions: +#' @importFrom dplyr dplyr_col_modify +#' @export +#' @noRd +dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { + if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { + cli_abort("`col_modify_recorder_df` can only record `cols` once", + internal = TRUE + ) + } + attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols + data } diff --git a/R/methods-epi_archive_new.R b/R/methods-epi_archive_new.R deleted file mode 100644 index 3ce39afc..00000000 --- a/R/methods-epi_archive_new.R +++ /dev/null @@ -1,826 +0,0 @@ -#' Generate a snapshot from an `epi_archive` object -#' -#' Generates a snapshot in `epi_df` format from an `epi_archive` object, as of a -#' given version. See the [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. -#' -#' @param x An `epi_archive` object -#' @param max_version Time value specifying the max version to permit in the -#' snapshot. That is, the snapshot will comprise the unique rows of the -#' current archive data that represent the most up-to-date signal values, as -#' of the specified `max_version` (and whose time values are at least -#' `min_time_value`.) -#' @param min_time_value Time value specifying the min time value to permit in -#' the snapshot. Default is `-Inf`, which effectively means that there is no -#' minimum considered. -#' @param all_versions If `all_versions = TRUE`, then the output will be in -#' `epi_archive` format, and contain rows in the specified `time_value` range -#' having `version <= max_version`. The resulting object will cover a -#' potentially narrower `version` and `time_value` range than `x`, depending -#' on user-provided arguments. Otherwise, there will be one row in the output -#' for the `max_version` of each `time_value`. Default is `FALSE`. -#' @return An `epi_df` object. -#' -#' @details This is simply a wrapper around the `as_of()` method of the -#' `epi_archive` class, so if `x` is an `epi_archive` object, then: -#' ``` -#' epix_as_of(x, max_version = v) -#' ``` -#' is equivalent to: -#' ``` -#' x$as_of(max_version = v) -#' ``` -#' -#' Mutation and aliasing: `epix_as_of` and `$as_of` will not mutate the input -#' archives, but may in some edge cases alias parts of the inputs, so copy the -#' outputs if needed before using mutating operations like `data.table`'s `:=` -#' operator. Currently, the only situation where there is potentially aliasing -#' is of the `DT` in edge cases with `all_versions = TRUE`, but this may change -#' in the future. -#' -#' @examples -#' # warning message of data latency shown -#' epix_as_of2( -#' archive_cases_dv_subset_2, -#' max_version = max(archive_cases_dv_subset_2$DT$version) -#' ) -#' -#' range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 -#' -#' epix_as_of2( -#' archive_cases_dv_subset_2, -#' max_version = as.Date("2020-06-12") -#' ) -#' -#' # When fetching a snapshot as of the latest version with update data in the -#' # archive, a warning is issued by default, as this update data might not yet -#' # be finalized (for example, if data versions are labeled with dates, these -#' # versions might be overwritten throughout the corresponding days with -#' # additional data or "hotfixes" of erroroneous data; when we build an archive -#' # based on database queries, the latest available update might still be -#' # subject to change, but previous versions should be finalized). We can -#' # muffle such warnings with the following pattern: -#' withCallingHandlers( -#' { -#' epix_as_of2( -#' archive_cases_dv_subset_2, -#' max_version = max(archive_cases_dv_subset_2$DT$version) -#' ) -#' }, -#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") -#' ) -#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used -#' # to globally toggle these warnings. -#' -#' @export -epix_as_of2 <- function(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) { - assert_class(epi_archive, "epi_archive2") - return(as_of(epi_archive, max_version, min_time_value, all_versions = all_versions)) -} - -#' `epi_archive` with unobserved history filled in (won't mutate, might alias) -#' -#' Sometimes, due to upstream data pipeline issues, we have to work with a -#' version history that isn't completely up to date, but with functions that -#' expect archives that are completely up to date, or equally as up-to-date as -#' another archive. This function provides one way to approach such mismatches: -#' pretend that we've "observed" additional versions, filling in these versions -#' with NAs or extrapolated values. -#' -#' '`epix_fill_through_version` will not mutate its `x` argument, but its result -#' might alias fields of `x` (e.g., mutating the result's `DT` might mutate -#' `x$DT`). The R6 method variant, `x$fill_through_version`, will mutate `x` to -#' give the result, but might reseat its fields (e.g., references to the old -#' `x$DT` might not be updated by this function or subsequent operations on -#' `x`), and returns the updated `x` [invisibly][base::invisible]. -#' -#' @param x An `epi_archive` -#' @param fill_versions_end Length-1, same class&type as `x$version`: the -#' version through which to fill in missing version history; this will be the -#' result's `$versions_end` unless it already had a later -#' `$versions_end`. -#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing -#' required version history with `NA`s, by inserting (if necessary) an update -#' immediately after the current `$versions_end` that revises all -#' existing measurements to be `NA` (this is only supported for `version` -#' classes with a `next_after` implementation); `"locf"` will fill in missing -#' version history with the last version of each observation carried forward -#' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are -#' based on LOCF). Default is `"na"`. -#' @return An `epi_archive` -epix_fill_through_version2 <- function(epi_archive, fill_versions_end, - how = c("na", "locf")) { - assert_class(epi_archive, "epi_archive2") - cloned_epi_archive <- clone(epi_archive) - # Enclosing parentheses drop the invisibility flag. See description above of - # potential mutation and aliasing behavior. - (fill_through_version(cloned_epi_archive, fill_versions_end, how = how)) -} - -#' Merge two `epi_archive` objects -#' -#' Merges two `epi_archive`s that share a common `geo_value`, `time_value`, and -#' set of key columns. When they also share a common `versions_end`, -#' using `$as_of` on the result should be the same as using `$as_of` on `x` and -#' `y` individually, then performing a full join of the `DT`s on the non-version -#' key columns (potentially consolidating multiple warnings about clobberable -#' versions). If the `versions_end` values differ, the -#' `sync` parameter controls what is done. -#' -#' This function, [`epix_merge`], does not mutate its inputs and will not alias -#' either archive's `DT`, but may alias other fields; `x$merge` will overwrite -#' `x` with the result of the merge, reseating its `DT` and several other fields -#' (making them point to different objects), but avoiding mutation of the -#' contents of the old `DT` (only relevant if you have another reference to the -#' old `DT` in another object). -#' -#' @param x,y Two `epi_archive` objects to join together. -#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the -#' case that `x$versions_end` doesn't match `y$versions_end`, what do we do?: -#' `"forbid"`: emit an error; "na": use `max(x$versions_end, y$versions_end)` -#' as the result's `versions_end`, but ensure that, if we request a snapshot -#' as of a version after `min(x$versions_end, y$versions_end)`, the -#' observation columns from the less up-to-date archive will be all NAs (i.e., -#' imagine there was an update immediately after its `versions_end` which -#' revised all observations to be `NA`); `"locf"`: use `max(x$versions_end, -#' y$versions_end)` as the result's `versions_end`, allowing the last version -#' of each observation to be carried forward to extrapolate unavailable -#' versions for the less up-to-date input archive (i.e., imagining that in the -#' less up-to-date archive's data set remained unchanged between its actual -#' `versions_end` and the other archive's `versions_end`); or `"truncate"`: -#' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, -#' and discard any rows containing update rows for later versions. -#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be -#' compactified? See [`as_epi_archive`] for an explanation of what this means. -#' Default here is `TRUE`. -#' @return the resulting `epi_archive` -#' -#' @details In all cases, `additional_metadata` will be an empty list, and -#' `clobberable_versions_start` will be set to the earliest version that could -#' be clobbered in either input archive. -#' -#' @examples -#' # create two example epi_archive datasets -#' x <- archive_cases_dv_subset_2$DT %>% -#' dplyr::select(geo_value, time_value, version, case_rate_7d_av) %>% -#' as_epi_archive2(compactify = TRUE) -#' y <- archive_cases_dv_subset_2$DT %>% -#' dplyr::select(geo_value, time_value, version, percent_cli) %>% -#' as_epi_archive2(compactify = TRUE) -#' # merge results stored in a third object: -#' xy <- epix_merge2(x, y) -#' -#' @importFrom data.table key set setkeyv -#' @export -epix_merge2 <- function(x, y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE) { - assert_class(x, "epi_archive2") - assert_class(y, "epi_archive2") - sync <- rlang::arg_match(sync) - - if (!identical(x$geo_type, y$geo_type)) { - cli_abort("`x` and `y` must have the same `$geo_type`") - } - - if (!identical(x$time_type, y$time_type)) { - cli_abort("`x` and `y` must have the same `$time_type`") - } - - if (length(x$additional_metadata) != 0L) { - cli_warn("x$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - } - if (length(y$additional_metadata) != 0L) { - cli_warn("y$additional_metadata won't appear in merge result", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - } - result_additional_metadata <- list() - - result_clobberable_versions_start <- - if (all(is.na(c(x$clobberable_versions_start, y$clobberable_versions_start)))) { - NA # (any type of NA is fine here) - } else { - min_na_rm(c(x$clobberable_versions_start, y$clobberable_versions_start)) - } - - # The actual merge below may not succeed 100% of the time, so do this - # preprocessing using non-mutating (but potentially aliasing) functions. This - # approach potentially uses more memory, but won't leave behind a - # partially-mutated `x` on failure. - if (sync == "forbid") { - if (!identical(x$versions_end, y$versions_end)) { - cli_abort(paste( - "`x` and `y` were not equally up to date version-wise:", - "`x$versions_end` was not identical to `y$versions_end`;", - "either ensure that `x` and `y` are equally up to date before merging,", - "or specify how to deal with this using `sync`" - ), class = "epiprocess__epix_merge_unresolved_sync") - } else { - new_versions_end <- x$versions_end - x_DT <- x$DT - y_DT <- y$DT - } - } else if (sync %in% c("na", "locf")) { - new_versions_end <- max(x$versions_end, y$versions_end) - x_DT <- epix_fill_through_version2(x, new_versions_end, sync)$DT - y_DT <- epix_fill_through_version2(y, new_versions_end, sync)$DT - } else if (sync == "truncate") { - new_versions_end <- min(x$versions_end, y$versions_end) - x_DT <- x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with = FALSE] - y_DT <- y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with = FALSE] - } else { - cli_abort("unimplemented") - } - - # key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same - # as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to - # split the code into separate functions if we wish), but still refer to - # {x,y}$DT in the error messages (further relying on this assumption). - # - # Check&ensure that the above assumption; if it didn't already hold, we likely - # have a bug in the preprocessing, a weird/invalid archive as input, and/or a - # data.table version with different semantics (which may break other parts of - # our code). - x_DT_key_as_expected <- identical(key(x$DT), key(x_DT)) - y_DT_key_as_expected <- identical(key(y$DT), key(y_DT)) - if (!x_DT_key_as_expected || !y_DT_key_as_expected) { - cli_warn(" - `epiprocess` internal warning (please report): pre-processing for - epix_merge unexpectedly resulted in an intermediate data table (or - tables) with a different key than the corresponding input archive. - Manually setting intermediate data table keys to the expected values. - ", internal = TRUE) - setkeyv(x_DT, key(x$DT)) - setkeyv(y_DT, key(y$DT)) - } - # Without some sort of annotations of what various columns represent, we can't - # do something that makes sense when merging archives with mismatched keys. - # E.g., even if we assume extra keys represent demographic breakdowns, a - # sensible default treatment of count-type and rate-type value columns would - # differ. - if (!identical(sort(key(x_DT)), sort(key(y_DT)))) { - cli_abort(" - The archives must have the same set of key column names; if the - key columns represent the same things, just with different - names, please retry after manually renaming to match; if they - represent different things (e.g., x has an age breakdown - but y does not), please retry after processing them to share - the same key (e.g., by summarizing x to remove the age breakdown, - or by applying a static age breakdown to y). - ", class = "epiprocess__epix_merge_x_y_must_have_same_key_set") - } - # `by` cols = result (and each input's) `key` cols, and determine - # the row set, determined using a full join via `merge` - # - # non-`by` cols = "value"-ish cols, and are looked up with last - # version carried forward via rolling joins - by <- key(x_DT) # = some perm of key(y_DT) - if (!all(c("geo_value", "time_value", "version") %in% key(x_DT))) { - cli_abort('Invalid `by`; `by` is currently set to the common `key` of - the two archives, and is expected to contain - "geo_value", "time_value", and "version".', - class = "epiprocess__epi_archive_must_have_required_key_cols" - ) - } - if (length(by) < 1L || utils::tail(by, 1L) != "version") { - cli_abort('Invalid `by`; `by` is currently set to the common `key` of - the two archives, and is expected to have a "version" as - the last key col.', - class = "epiprocess__epi_archive_must_have_version_at_end_of_key" - ) - } - x_nonby_colnames <- setdiff(names(x_DT), by) - y_nonby_colnames <- setdiff(names(y_DT), by) - if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { - cli_abort(" - `x` and `y` DTs have overlapping non-by column names; - this is currently not supported; please manually fix up first: - any overlapping columns that can are key-like should be - incorporated into the key, and other columns should be renamed. - ", class = "epiprocess__epix_merge_x_y_must_not_have_overlapping_nonby_colnames") - } - x_by_vals <- x_DT[, by, with = FALSE] - if (anyDuplicated(x_by_vals) != 0L) { - cli_abort(" - The `by` columns must uniquely determine rows of `x$DT`; - the `by` is currently set to the common `key` of the two - archives, so this can be resolved by adding key-like columns - to `x`'s key (to get a unique key). - ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") - } - y_by_vals <- y_DT[, by, with = FALSE] - if (anyDuplicated(y_by_vals) != 0L) { - cli_abort(" - The `by` columns must uniquely determine rows of `y$DT`; - the `by` is currently set to the common `key` of the two - archives, so this can be resolved by adding key-like columns - to `y`'s key (to get a unique key). - ", class = "epiprocess__epix_merge_by_cols_must_act_as_unique_key") - } - result_DT <- merge(x_by_vals, y_by_vals, - by = by, - # We must have `all=TRUE` or we may skip updates - # from x and/or y and corrupt the history - all = TRUE, - # We don't want Cartesian products, but the - # by-is-unique-key check above already ensures - # this. (Note that `allow.cartesian=FALSE` doesn't - # actually catch all Cartesian products anyway.) - # Disable superfluous check: - allow.cartesian = TRUE - ) - set( - result_DT, , x_nonby_colnames, - x_DT[result_DT[, by, with = FALSE], x_nonby_colnames, - with = FALSE, - # It's good practice to specify `on`, and we must - # explicitly specify `on` if there's a potential key vs. - # by order mismatch (not possible currently for x - # with by = key(x$DT), but possible for y): - on = by, - # last version carried forward: - roll = TRUE, - # requesting non-version key that doesn't exist in the other archive, - # or before its first version, should result in NA - nomatch = NA, - # see note on `allow.cartesian` above; currently have a - # similar story here. - allow.cartesian = TRUE - ] - ) - set( - result_DT, , y_nonby_colnames, - y_DT[result_DT[, by, with = FALSE], y_nonby_colnames, - with = FALSE, - on = by, - roll = TRUE, - nomatch = NA, - allow.cartesian = TRUE - ] - ) - # The key could be unset in case of a key vs. by order mismatch as - # noted above. Ensure that we keep it: - setkeyv(result_DT, by) - - return(as_epi_archive2( - result_DT[], # clear data.table internal invisibility flag if set - geo_type = x$geo_type, - time_type = x$time_type, - other_keys = setdiff(key(result_DT), c("geo_value", "time_value", "version")), - additional_metadata = result_additional_metadata, - # It'd probably be better to pre-compactify before the merge, and might be - # guaranteed not to be necessary to compactify the merge result if the - # inputs are already compactified, but at time of writing we don't have - # compactify in its own method or field, and it seems like it should be - # pretty fast anyway. - compactify = compactify, - clobberable_versions_start = result_clobberable_versions_start, - versions_end = new_versions_end - )) -} - -# Helpers for `group_by`: - -#' Make non-testing mock to get [`dplyr::dplyr_col_modify`] input -#' -#' A workaround for `dplyr:::mutate_cols` not being exported and directly -#' applying test mock libraries likely being impossible (due to mocking another -#' package's S3 generic or method). -#' -#' Use solely with a single call to the [`dplyr::mutate`] function and then -#' `destructure_col_modify_recorder_df`; other applicable operations from -#' [dplyr::dplyr_extending] have not been implemented. -#' -#' @param parent_df the "parent class" data frame to wrap -#' @return a `col_modify_recorder_df` -#' -#' @noRd -new_col_modify_recorder_df <- function(parent_df) { - assert_class(parent_df, "data.frame") - `class<-`(parent_df, c("col_modify_recorder_df", class(parent_df))) -} - -#' Extract unchanged parent-class data frame from a `new_col_modify_recorder_df` -#' -#' @param col_modify_recorder_df an instance of a `col_modify_recorder_df` -#' @return named list with elements `unchanged_parent_df`, `cols`; `cols` is the -#' input to [`dplyr::dplyr_col_modify`] that this class was designed to record -#' -#' @noRd -destructure_col_modify_recorder_df <- function(col_modify_recorder_df) { - assert_class(col_modify_recorder_df, "col_modify_recorder_df") - list( - unchanged_parent_df = col_modify_recorder_df %>% - `attr<-`("epiprocess::col_modify_recorder_df::cols", NULL) %>% - `class<-`(setdiff(class(.), "col_modify_recorder_df")), - cols = attr(col_modify_recorder_df, - "epiprocess::col_modify_recorder_df::cols", - exact = TRUE - ) - ) -} - -#' `dplyr_col_modify` method that simply records the `cols` argument -#' -#' Must export S3 methods in R >= 4.0, even if they're only designed to be -#' package internals, and must import any corresponding upstream S3 generic -#' functions: -#' @importFrom dplyr dplyr_col_modify -#' @export -#' @noRd -dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { - if (!is.null(attr(data, "epiprocess::col_modify_recorder_df::cols", exact = TRUE))) { - cli_abort("`col_modify_recorder_df` can only record `cols` once", - internal = TRUE - ) - } - attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols - data -} - -#' A more detailed but restricted `mutate` for use in `group_by.epi_archive` -#' -#' More detailed: provides the names of the "requested" columns in addition to -#' the output expected from a regular `mutate` method. -#' -#' Restricted: doesn't allow replacing or removing key cols, where a sort is -#' potentially required at best and what the output key should be is unclear at -#' worst. (The originally expected restriction was that the `mutate` parameters -#' not present in `group_by` would not be recognized, but the current -#' implementation just lets `mutate` handle these even anyway, even if they're -#' not part of the regular `group_by` parameters; these arguments would have to -#' be passed by names with dot prefixes, so just hope that the user means to use -#' them here if provided.) -#' -#' This can introduce column-level aliasing in `data.table`s, which isn't really -#' intended in the `data.table` user model but we can make it part of our user -#' model (see -#' https://stackoverflow.com/questions/45925482/make-a-shallow-copy-in-data-table -#' and links). -#' -#' Don't export this without cleaning up language of "mutate" as in side effects -#' vs. "mutate" as in `dplyr::mutate`. -#' @noRd -epix_detailed_restricted_mutate2 <- function(.data, ...) { - # We don't want to directly use `dplyr::mutate` on the `$DT`, as: - # - `mutate` behavior, including the output class, changes depending on - # whether `dtplyr` < 1.3.0 is loaded and would require post-processing - # - behavior with `dtplyr` isn't fully compatible - # - it doesn't give the desired details, and `rlang::exprs_auto_name` does not - # appropriately handle the `= NULL` and `= ` tidyeval cases - # Instead: - # - Use `as.list` to get a shallow copy (undocumented, but apparently - # intended, behavior), then `as_tibble` (also shallow, given a list) to get - # back to something that will use `dplyr`'s included `mutate` method(s), - # then convert this using shallow operations into a `data.table`. - # - Use `col_modify_recorder_df` to get the desired details. - in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") - col_modify_cols <- - destructure_col_modify_recorder_df( - mutate(new_col_modify_recorder_df(in_tbl), ...) - )[["cols"]] - invalidated_key_col_is <- - which(purrr::map_lgl(key(.data$DT), function(key_colname) { - key_colname %in% names(col_modify_cols) && - !rlang::is_reference(in_tbl[[key_colname]], col_modify_cols[[key_colname]]) - })) - if (length(invalidated_key_col_is) != 0L) { - rlang::abort(paste_lines(c( - "Key columns must not be replaced or removed.", - wrap_varnames(key(.data$DT)[invalidated_key_col_is], - initial = "Flagged key cols: " - ) - ))) - } else { - # Have `dplyr` do the `dplyr_col_modify`, keeping the column-level-aliasing - # and must-copy-on-write-if-refcount-more-than-1 model, obtaining a tibble, - # then convert it into a `data.table`. The key should still be valid - # (assuming that the user did not explicitly alter `key(.data$DT)` or the - # columns by reference somehow within `...` tidyeval-style computations, or - # trigger refcount-1 alterations due to still having >1 refcounts on the - # columns), set the "sorted" attribute accordingly to prevent attempted - # sorting (including potential extra copies) or sortedness checking, then - # `setDT` (rather than `as.data.table`, in order to prevent column copying - # to establish ownership according to `data.table`'s memory model). - out_DT <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% - data.table::setattr("sorted", data.table::key(.data$DT)) %>% - data.table::setDT(key = key(.data$DT)) - out_archive <- clone(.data) - out_archive$DT <- out_DT - request_names <- names(col_modify_cols) - return(list( - archive = out_archive, - request_names = request_names - )) - # (We might also consider special-casing when `mutate` hands back something - # equivalent (in some sense) to the input (probably only encountered when - # we're dealing with `group_by`), and using just `$DT`, not a shallow copy, - # in the result, primarily in order to hedge against `as.list` or `setDT` - # changing their behavior and generating deep copies somehow. This could - # also prevent storage, and perhaps also generation, of shallow copies, but - # this seems unlikely to be a major gain unless it helps enable some - # in-place modifications of refcount-1 columns (although detecting this case - # seems to be common across `group_by` implementations; maybe there is - # something there).) - } -} - - -#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` -#' -#' Slides a given function over variables in an `epi_archive` object. This -#' behaves similarly to `epi_slide()`, with the key exception that it is -#' version-aware: the sliding computation at any given reference time t is -#' performed on **data that would have been available as of t**. See the -#' [archive -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for -#' examples. -#' -#' @param x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, -#' all data in `x` will be treated as part of a single data group. -#' @param f Function, formula, or missing; together with `...` specifies the -#' computation to slide. To "slide" means to apply a computation over a -#' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `before` parameter described below. One time step is -#' typically one day or one week; see [`epi_slide`] details for more -#' explanation. If a function, `f` must take an `epi_df` with the same -#' column names as the archive's `DT`, minus the `version` column; followed -#' by a one-row tibble containing the values of the grouping variables for -#' the associated group; followed by a reference time value, usually as a -#' `Date` object; followed by any number of named arguments. If a formula, -#' `f` can operate directly on columns accessed via `.x$var` or `.$var`, as -#' in `~ mean (.x$var)` to compute a mean of a column `var` for each -#' group-`ref_time_value` combination. The group key can be accessed via -#' `.y` or `.group_key`, and the reference time value can be accessed via -#' `.z` or `.ref_time_value`. If `f` is missing, then `...` will specify the -#' computation. -#' @param ... Additional arguments to pass to the function or formula specified -#' via `f`. Alternatively, if `f` is missing, then `...` is interpreted as an -#' expression for tidy evaluation; in addition to referring to columns -#' directly by name, the expression has access to `.data` and `.env` pronouns -#' as in `dplyr` verbs, and can also refer to the `.group_key` and -#' `.ref_time_value`. See details of [`epi_slide`]. -#' @param before How far `before` each `ref_time_value` should the sliding -#' window extend? If provided, should be a single, non-NA, -#' [integer-compatible][vctrs::vec_cast] number of time steps. This window -#' endpoint is inclusive. For example, if `before = 7`, and one time step is -#' one day, then to produce a value for a `ref_time_value` of January 8, we -#' apply the given function or formula to data (for each group present) with -#' `time_value`s from January 1 onward, as they were reported on January 8. -#' For typical disease surveillance sources, this will not include any data -#' with a `time_value` of January 8, and, depending on the amount of reporting -#' latency, may not include January 7 or even earlier `time_value`s. (If -#' instead the archive were to hold nowcasts instead of regular surveillance -#' data, then we would indeed expect data for `time_value` January 8. If it -#' were to hold forecasts, then we would expect data for `time_value`s after -#' January 8, and the sliding window would extend as far after each -#' `ref_time_value` as needed to include all such `time_value`s.) -#' @param ref_time_values Reference time values / versions for sliding -#' computations; each element of this vector serves both as the anchor point -#' for the `time_value` window for the computation and the `max_version` -#' `as_of` which we fetch data in this window. If missing, then this will set -#' to a regularly-spaced sequence of values set to cover the range of -#' `version`s in the `DT` plus the `versions_end`; the spacing of values will -#' be guessed (using the GCD of the skips between values). -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a positive integer and return -#' an object of class `lubridate::period`. For example, we can use `time_step -#' = lubridate::hours` in order to set the time step to be one hour (this -#' would only be meaningful if `time_value` is of class `POSIXct`). -#' @param new_col_name String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_name` equal to an existing column name will overwrite this column. -#' @param as_list_col Should the slide results be held in a list column, or be -#' [unchopped][tidyr::unchop]/[unnested][tidyr::unnest]? Default is `FALSE`, -#' in which case a list object returned by `f` would be unnested (using -#' [`tidyr::unnest()`]), and, if the slide computations output data frames, -#' the names of the resulting columns are given by prepending `new_col_name` -#' to the names of the list elements. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_name` entirely. -#' @param all_versions (Not the same as `all_rows` parameter of `epi_slide`.) If -#' `all_versions = TRUE`, then `f` will be passed the version history (all -#' `version <= ref_time_value`) for rows having `time_value` between -#' `ref_time_value - before` and `ref_time_value`. Otherwise, `f` will be -#' passed only the most recent `version` for every unique `time_value`. -#' Default is `FALSE`. -#' @return A tibble whose columns are: the grouping variables, `time_value`, -#' containing the reference time values for the slide computation, and a -#' column named according to the `new_col_name` argument, containing the slide -#' values. -#' -#' @details A few key distinctions between the current function and `epi_slide()`: -#' 1. In `f` functions for `epix_slide`, one should not assume that the input -#' data to contain any rows with `time_value` matching the computation's -#' `ref_time_value` (accessible via `attributes()$metadata$as_of`); for -#' typical epidemiological surveillance data, observations pertaining to a -#' particular time period (`time_value`) are first reported `as_of` some -#' instant after that time period has ended. -#' 2. `epix_slide()` doesn't accept an `after` argument; its windows extend -#' from `before` time steps before a given `ref_time_value` through the last -#' `time_value` available as of version `ref_time_value` (typically, this -#' won't include `ref_time_value` itself, as observations about a particular -#' time interval (e.g., day) are only published after that time interval -#' ends); `epi_slide` windows extend from `before` time steps before a -#' `ref_time_value` through `after` time steps after `ref_time_value`. -#' 3. The input class and columns are similar but different: `epix_slide` -#' (with the default `all_versions=FALSE`) keeps all columns and the -#' `epi_df`-ness of the first argument to each computation; `epi_slide` only -#' provides the grouping variables in the second input, and will convert the -#' first input into a regular tibble if the grouping variables include the -#' essential `geo_value` column. (With `all_versions=TRUE`, `epix_slide` will -#' will provide an `epi_archive` rather than an `epi-df` to each -#' computation.) -#' 4. The output class and columns are similar but different: `epix_slide()` -#' returns a tibble containing only the grouping variables, `time_value`, and -#' the new column(s) from the slide computations, whereas `epi_slide()` -#' returns an `epi_df` with all original variables plus the new columns from -#' the slide computations. (Both will mirror the grouping or ungroupedness of -#' their input, with one exception: `epi_archive`s can have trivial -#' (zero-variable) groupings, but these will be dropped in `epix_slide` -#' results as they are not supported by tibbles.) -#' 5. There are no size stability checks or element/row recycling to maintain -#' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is -#' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly -#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed -#' in the "advanced" vignette. -#' 6. `all_rows` is not supported in `epix_slide`; since the slide -#' computations are allowed more flexibility in their outputs than in -#' `epi_slide`, we can't guess a good representation for missing computations -#' for excluded group-`ref_time_value` pairs. -#' 7. The `ref_time_values` default for `epix_slide` is based on making an -#' evenly-spaced sequence out of the `version`s in the `DT` plus the -#' `versions_end`, rather than the `time_value`s. -#' -#' Apart from the above distinctions, the interfaces between `epix_slide()` and -#' `epi_slide()` are the same. -#' -#' Furthermore, the current function can be considerably slower than -#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch -#' properly-versioned snapshots from the data archive (via its `as_of()` -#' method), and (2) it performs a "manual" sliding of sorts, and does not -#' benefit from the highly efficient `slider` package. For this reason, it -#' should never be used in place of `epi_slide()`, and only used when -#' version-aware sliding is necessary (as it its purpose). -#' -#' Finally, this is simply a wrapper around the `slide()` method of the -#' `epi_archive` and `grouped_epi_archive` classes, so if `x` is an -#' object of either of these classes, then: -#' ``` -#' epix_slide(x, new_var = comp(old_var), before = 119) -#' ``` -#' is equivalent to: -#' ``` -#' x$slide(new_var = comp(old_var), before = 119) -#' ``` -#' -#' Mutation and aliasing: `epix_slide` and `$slide` will not perform in-place -#' mutation of the input archives on their own. In some edge cases the inputs it -#' feeds to the slide computations may alias parts of the input archive, so copy -#' the slide computation inputs if needed before using mutating operations like -#' `data.table`'s `:=` operator. Similarly, in some edge cases, the output of -#' the slide operation may alias parts of the input archive, so similarly, make -#' sure to clone and/or copy appropriately before using in-place mutation. -#' -#' @examples -#' library(dplyr) -#' -#' # Reference time points for which we want to compute slide values: -#' ref_time_values <- seq(as.Date("2020-06-01"), -#' as.Date("2020-06-15"), -#' by = "1 day" -#' ) -#' -#' # A simple (but not very useful) example (see the archive vignette for a more -#' # realistic one): -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' f = ~ mean(.x$case_rate_7d_av), -#' before = 2, -#' ref_time_values = ref_time_values, -#' new_col_name = "case_rate_7d_av_recent_av" -#' ) %>% -#' ungroup() -#' # We requested time windows that started 2 days before the corresponding time -#' # values. The actual number of `time_value`s in each computation depends on -#' # the reporting latency of the signal and `time_value` range covered by the -#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -#' # discarded -#' # * 1 `time_value`, for ref time 2020-06-02 -#' # * 2 `time_value`s, for the rest of the results -#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because -#' # of data latency, we'll never have an observation -#' # `time_value == ref_time_value` as of `ref_time_value`. -#' # The example below shows this type of behavior in more detail. -#' -#' # Examining characteristics of the data passed to each computation with -#' # `all_versions=FALSE`. -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' function(x, gk, rtv) { -#' tibble( -#' time_range = if (nrow(x) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) -#' }, -#' n = nrow(x), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' before = 5, all_versions = FALSE, -#' ref_time_values = ref_time_values, names_sep = NULL -#' ) %>% -#' ungroup() %>% -#' arrange(geo_value, time_value) -#' -#' # --- Advanced: --- -#' -#' # `epix_slide` with `all_versions=FALSE` (the default) applies a -#' # version-unaware computation to several versions of the data. We can also -#' # use `all_versions=TRUE` to apply a version-*aware* computation to several -#' # versions of the data, again looking at characteristics of the data passed -#' # to each computation. In this case, each computation should expect an -#' # `epi_archive` containing the relevant version data: -#' -#' archive_cases_dv_subset_2 %>% -#' group_by(geo_value) %>% -#' epix_slide2( -#' function(x, gk, rtv) { -#' tibble( -#' versions_start = if (nrow(x$DT) == 0L) { -#' "NA (0 rows)" -#' } else { -#' toString(min(x$DT$version)) -#' }, -#' versions_end = x$versions_end, -#' time_range = if (nrow(x$DT) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$DT$time_value), max(x$DT$time_value)) -#' }, -#' n = nrow(x$DT), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' before = 5, all_versions = TRUE, -#' ref_time_values = ref_time_values, names_sep = NULL -#' ) %>% -#' ungroup() %>% -#' # Focus on one geo_value so we can better see the columns above: -#' filter(geo_value == "ca") %>% -#' select(-geo_value) -#' -#' @importFrom rlang enquo !!! -#' @export -epix_slide2 <- function(x, f, ..., before, ref_time_values, - time_step, new_col_name = "slide_value", - as_list_col = FALSE, names_sep = "_", - all_versions = FALSE) { - if (!is_epi_archive2(x, grouped_okay = TRUE)) { - cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") - } - return(slide(x, f, ..., - before = before, - ref_time_values = ref_time_values, - time_step = time_step, - new_col_name = new_col_name, - as_list_col = as_list_col, - names_sep = names_sep, - all_versions = all_versions - )) -} - - -#' Filter an `epi_archive` object to keep only older versions -#' -#' Generates a filtered `epi_archive` from an `epi_archive` object, keeping -#' only rows with `version` falling on or before a specified date. -#' -#' @param x An `epi_archive` object -#' @param max_version Time value specifying the max version to permit in the -#' filtered archive. That is, the output archive will comprise rows of the -#' current archive data having `version` less than or equal to the -#' specified `max_version` -#' @return An `epi_archive` object -#' -#' @export -epix_truncate_versions_after <- function(x, max_version) { - UseMethod("epix_truncate_versions_after") -} - -#' @export -epix_truncate_versions_after.epi_archive2 <- function(x, max_version) { - cloned_epi_archive <- clone(x) - return((truncate_versions_after(x, max_version))) - # ^ second set of parens drops invisibility -} diff --git a/R/utils.R b/R/utils.R index 57a7f53a..ea7afc2f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -359,8 +359,6 @@ as_slide_computation <- function(f, ...) { ) } -min_na_rm <- function(x) min(x, na.rm = TRUE) -extend_r <- function(x) c(x, x[length(x)]) guess_geo_type <- function(geo_value) { if (is.character(geo_value)) { diff --git a/_pkgdown.yml b/_pkgdown.yml index 1daef5a0..4930c9f5 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -95,6 +95,9 @@ reference: - title: Basic automatic plotting - contents: - autoplot.epi_df + - title: Advanced internals + - contents: + - compactify - title: internal - contents: - epiprocess diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd deleted file mode 100644 index 93b10736..00000000 --- a/man/as_epi_archive.Rd +++ /dev/null @@ -1,142 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R -\name{as_epi_archive} -\alias{as_epi_archive} -\title{Convert to \code{epi_archive} format} -\usage{ -as_epi_archive( - x, - geo_type, - time_type, - other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x) -) -} -\arguments{ -\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{other_keys}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or -\code{NULL} will remove these rows and issue a warning. Generally, this can be -set to \code{TRUE}, but if you directly inspect or edit the fields of the -\code{epi_archive} such as its \code{DT}, you will have to determine whether -\code{compactify=TRUE} will produce the desired results. If compactification -here is removing a large proportion of the rows, this may indicate a -potential for space, time, or bandwidth savings upstream the data pipeline, -e.g., when fetching, storing, or preparing the input data \code{x}} - -\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the \emph{same} version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable under routine circumstances, such as (a) -today's version of one/all of the columns being published after initially -being filled with \code{NA} or LOCF, (b) a buggy version of today's data being -published but then fixed and republished later in the day, or (c) data -pipeline delays (e.g., publisher uploading, periodic scraping, database -syncing, periodic fetching, etc.) that make events (a) or (b) reflected -later in the day (or even on a different day) than expected; potential -causes vary between different data pipelines. The default value is \code{NA}, -which doesn't consider any versions to be clobberable. Another setting that -may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} - -\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is -\code{max_version_with_row_in(x)}, but values greater than this could also be -valid, and would indicate that we observed additional versions of the data -beyond \code{max(x$version)}, but they all contained empty updates. (The default -value of \code{clobberable_versions_start} does not fully trust these empty -updates, and assumes that any version \verb{>= max(x$version)} could be -clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -} -\value{ -An \code{epi_archive} object. -} -\description{ -Converts a data frame, data table, or tibble into an \code{epi_archive} -object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. The parameter descriptions below are copied from there -} -\details{ -This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example: - -\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} - -would be equivalent to: - -\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} -} -\examples{ -# Simple ex. with necessary keys -tib <- tibble::tibble( - geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5 - ), times = 2), - value = rnorm(10, mean = 2, sd = 1) -) - -toy_epi_archive <- tib \%>\% as_epi_archive( - geo_type = "state", - time_type = "day" -) -toy_epi_archive - -# Ex. with an additional key for county -df <- data.frame( - geo_value = c(replicate(2, "ca"), replicate(2, "fl")), - county = c(1, 3, 2, 5), - time_value = c( - "2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02" - ), - version = c( - "2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03" - ), - cases = c(1, 2, 3, 4), - cases_rate = c(0.01, 0.02, 0.01, 0.05) -) - -x <- df \%>\% as_epi_archive( - geo_type = "state", - time_type = "day", - other_keys = "county" -) -} diff --git a/man/as_epi_archive2.Rd b/man/as_epi_archive2.Rd deleted file mode 100644 index bc3f5185..00000000 --- a/man/as_epi_archive2.Rd +++ /dev/null @@ -1,142 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{as_epi_archive2} -\alias{as_epi_archive2} -\title{Convert to \code{epi_archive} format} -\usage{ -as_epi_archive2( - x, - geo_type, - time_type, - other_keys, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x) -) -} -\arguments{ -\item{x}{A data frame, data table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{other_keys}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space. \code{TRUE} will remove these rows, \code{FALSE} will not, and missing or -\code{NULL} will remove these rows and issue a warning. Generally, this can be -set to \code{TRUE}, but if you directly inspect or edit the fields of the -\code{epi_archive} such as its \code{DT}, you will have to determine whether -\code{compactify=TRUE} will produce the desired results. If compactification -here is removing a large proportion of the rows, this may indicate a -potential for space, time, or bandwidth savings upstream the data pipeline, -e.g., when fetching, storing, or preparing the input data \code{x}} - -\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the \emph{same} version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable under routine circumstances, such as (a) -today's version of one/all of the columns being published after initially -being filled with \code{NA} or LOCF, (b) a buggy version of today's data being -published but then fixed and republished later in the day, or (c) data -pipeline delays (e.g., publisher uploading, periodic scraping, database -syncing, periodic fetching, etc.) that make events (a) or (b) reflected -later in the day (or even on a different day) than expected; potential -causes vary between different data pipelines. The default value is \code{NA}, -which doesn't consider any versions to be clobberable. Another setting that -may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} - -\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is -\code{max_version_with_row_in(x)}, but values greater than this could also be -valid, and would indicate that we observed additional versions of the data -beyond \code{max(x$version)}, but they all contained empty updates. (The default -value of \code{clobberable_versions_start} does not fully trust these empty -updates, and assumes that any version \verb{>= max(x$version)} could be -clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} -} -\value{ -An \code{epi_archive} object. -} -\description{ -Converts a data frame, data table, or tibble into an \code{epi_archive} -object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. The parameter descriptions below are copied from there -} -\details{ -This simply a wrapper around the \code{new()} method of the \code{epi_archive} -class, so for example: - -\if{html}{\out{
}}\preformatted{x <- as_epi_archive(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} - -would be equivalent to: - -\if{html}{\out{
}}\preformatted{x <- epi_archive$new(df, geo_type = "state", time_type = "day") -}\if{html}{\out{
}} -} -\examples{ -# Simple ex. with necessary keys -tib <- tibble::tibble( - geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5 - ), times = 2), - value = rnorm(10, mean = 2, sd = 1) -) - -toy_epi_archive <- tib \%>\% as_epi_archive2( - geo_type = "state", - time_type = "day" -) -toy_epi_archive - -# Ex. with an additional key for county -df <- data.frame( - geo_value = c(replicate(2, "ca"), replicate(2, "fl")), - county = c(1, 3, 2, 5), - time_value = c( - "2020-06-01", - "2020-06-02", - "2020-06-01", - "2020-06-02" - ), - version = c( - "2020-06-02", - "2020-06-03", - "2020-06-02", - "2020-06-03" - ), - cases = c(1, 2, 3, 4), - cases_rate = c(0.01, 0.02, 0.01, 0.05) -) - -x <- df \%>\% as_epi_archive2( - geo_type = "state", - time_type = "day", - other_keys = "county" -) -} diff --git a/man/as_of.epi_archive2.Rd b/man/as_of.epi_archive2.Rd deleted file mode 100644 index 21a4cfc1..00000000 --- a/man/as_of.epi_archive2.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{as_of.epi_archive2} -\alias{as_of.epi_archive2} -\title{As of epi_archive} -\usage{ -\method{as_of}{epi_archive2}(epi_archive, max_version, min_time_value = -Inf, all_versions = FALSE) -} -\arguments{ -\item{epi_archive}{An \code{epi_archive} object} - -\item{max_version}{Version specifying the max version to permit in the -snapshot. That is, the snapshot will comprise the unique rows of the -current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose \code{time_value}s are at least -\code{min_time_value}).} - -\item{min_time_value}{Time value specifying the min \code{time_value} to permit in -the snapshot. Default is \code{-Inf}, which effectively means that there is no -minimum considered.} - -\item{all_versions}{Boolean; If \code{all_versions = TRUE}, then the output will be in -\code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a -potentially narrower \code{version} and \code{time_value} range than \code{x}, depending -on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} -} -\description{ -Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for -details. The parameter descriptions below are copied from there -} diff --git a/man/clone.Rd b/man/clone.Rd new file mode 100644 index 00000000..a5597e3b --- /dev/null +++ b/man/clone.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{clone} +\alias{clone} +\alias{clone.epi_archive} +\title{Clone an \code{epi_archive} object.} +\usage{ +clone(x) + +\method{clone}{epi_archive}(x) +} +\arguments{ +\item{x}{An \code{epi_archive} object.} +} +\description{ +Clone an \code{epi_archive} object. +} diff --git a/man/compactify.Rd b/man/compactify.Rd new file mode 100644 index 00000000..2f210315 --- /dev/null +++ b/man/compactify.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{compactify} +\alias{compactify} +\title{Compactify} +\description{ +This section describes the internals of how compactification works in an +\code{epi_archive()}. Compactification can potentially improve code speed or +memory usage, depending on your data. +} +\details{ +In general, the last version of each observation is carried forward (LOCF) to +fill in data between recorded versions, and between the last recorded +update and the \code{versions_end}. One consequence is that the \code{DT} doesn't +have to contain a full snapshot of every version (although this generally +works), but can instead contain only the rows that are new or changed from +the previous version (see \code{compactify}, which does this automatically). +Currently, deletions must be represented as revising a row to a special +state (e.g., making the entries \code{NA} or including a special column that +flags the data as removed and performing some kind of post-processing), and +the archive is unaware of what this state is. Note that \code{NA}s \emph{can} be +introduced by \code{epi_archive} methods for other reasons, e.g., in +\code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to +represent potential update data that we do not yet have access to; or in +\code{\link{epix_merge}} to represent the "value" of an observation before the +version in which it was first released, or if no version of that +observation appears in the archive data at all. +} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index 86e21b89..b7dd649e 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -1,73 +1,97 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R, R/archive_new.R +% Please edit documentation in R/archive.R \name{epi_archive} \alias{epi_archive} +\alias{new_epi_archive} +\alias{as_epi_archive} \title{\code{epi_archive} object} -\description{ -An \code{epi_archive} is an R6 class which contains a data table -along with several relevant pieces of metadata. The data table can be seen -as the full archive (version history) for some signal variables of -interest. +\usage{ +new_epi_archive( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NA, + versions_end = NULL +) -An \code{epi_archive} is an R6 class which contains a data table -along with several relevant pieces of metadata. The data table can be seen -as the full archive (version history) for some signal variables of -interest. -} -\details{ -An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of -class \code{data.table} from the \code{data.table} package, with (at least) the -following columns: -\itemize{ -\item \code{geo_value}: the geographic value associated with each row of measurements. -\item \code{time_value}: the time value associated with each row of measurements. -\item \code{version}: the time value specifying the version for each row of -measurements. For example, if in a given row the \code{version} is January 15, -2022 and \code{time_value} is January 14, 2022, then this row contains the -measurements of the data for January 14, 2022 that were available one day -later. +as_epi_archive( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = list(), + compactify = NULL, + clobberable_versions_start = NA, + versions_end = max_version_with_row_in(x) +) } +\arguments{ +\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, +\code{time_value}, \code{version}, and then any additional number of columns.} -The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, -as well as any others (these can be specified when instantiating the -\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating -on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for -information and examples of relevant parameter names for an \code{epi_archive} object. -Note that there can only be a single row per unique combination of -key variables, and thus the key variables are critical for figuring out how -to generate a snapshot of data from the archive, as of a given version. +\item{geo_type}{Type for the geo values. If missing, then the function will +attempt to infer it from the geo values present; if this fails, then it +will be set to "custom".} -In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions, and between the last recorded -update and the \code{versions_end}. One consequence is that the \code{DT} -doesn't have to contain a full snapshot of every version (although this -generally works), but can instead contain only the rows that are new or -changed from the previous version (see \code{compactify}, which does this -automatically). Currently, deletions must be represented as revising a row -to a special state (e.g., making the entries \code{NA} or including a special -column that flags the data as removed and performing some kind of -post-processing), and the archive is unaware of what this state is. Note -that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, -e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to -represent potential update data that we do not yet have access to; or in -\code{\link{epix_merge}} to represent the "value" of an observation before the -version in which it was first released, or if no version of that -observation appears in the archive data at all. +\item{time_type}{Type for the time values. If missing, then the function will +attempt to infer it from the time values present; if this fails, then it +will be set to "custom".} + +\item{other_keys}{Character vector specifying the names of variables in \code{x} +that should be considered key variables (in the language of \code{data.table}) +apart from "geo_value", "time_value", and "version".} -\strong{A word of caution:} R6 objects, unlike most other objects in R, have -reference semantics. A primary consequence of this is that objects are not -copied when modified. You can read more about this in Hadley Wickham's -\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order -to construct a modified archive while keeping the original intact, first -make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} -field with \code{data.table::copy(clone$DT)}, and finally perform the -modifications on the clone. +\item{additional_metadata}{List of additional metadata to attach to the +\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} +fields; named entries from the passed list or will be included as well.} -epi archive +\item{compactify}{Optional; Boolean or \code{NULL}. \code{TRUE} will remove some +redundant rows, \code{FALSE} will not, and missing or \code{NULL} will remove +redundant rows, but issue a warning. See more information at \code{compactify}.} + +\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the +same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and +\code{typeof}: specifically, either (a) the earliest version that could be +subject to "clobbering" (being overwritten with different update data, but +using the \emph{same} version tag as the old update data), or (b) \code{NA}, to +indicate that no versions are clobberable. There are a variety of reasons +why versions could be clobberable under routine circumstances, such as (a) +today's version of one/all of the columns being published after initially +being filled with \code{NA} or LOCF, (b) a buggy version of today's data being +published but then fixed and republished later in the day, or (c) data +pipeline delays (e.g., publisher uploading, periodic scraping, database +syncing, periodic fetching, etc.) that make events (a) or (b) reflected +later in the day (or even on a different day) than expected; potential +causes vary between different data pipelines. The default value is \code{NA}, +which doesn't consider any versions to be clobberable. Another setting that +may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} + +\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as +\code{x$version}: what is the last version we have observed? The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. (The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} +} +\value{ +An \code{epi_archive} object. +} +\description{ +An \code{epi_archive} is an S3 class which contains a data table +along with several relevant pieces of metadata. The data table can be seen +as the full archive (version history) for some signal variables of +interest. +} +\details{ +Epi Archive -An \code{epi_archive} is an R6 class which contains a data table \code{DT}, of -class \code{data.table} from the \code{data.table} package, with (at least) the -following columns: +An \code{epi_archive} contains a data table \code{DT}, of class \code{data.table} +from the \code{data.table} package, with (at least) the following columns: \itemize{ \item \code{geo_value}: the geographic value associated with each row of measurements. \item \code{time_value}: the time value associated with each row of measurements. @@ -81,56 +105,14 @@ later. The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, as well as any others (these can be specified when instantiating the \code{epi_archive} object via the \code{other_keys} argument, and/or set by operating -on \code{DT} directly). Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for -information and examples of relevant parameter names for an \code{epi_archive} object. -Note that there can only be a single row per unique combination of +on \code{DT} directly). Refer to the documentation for \code{as_epi_archive()} for +information and examples of relevant parameter names for an \code{epi_archive} +object. Note that there can only be a single row per unique combination of key variables, and thus the key variables are critical for figuring out how to generate a snapshot of data from the archive, as of a given version. - -In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions, and between the last recorded -update and the \code{versions_end}. One consequence is that the \code{DT} -doesn't have to contain a full snapshot of every version (although this -generally works), but can instead contain only the rows that are new or -changed from the previous version (see \code{compactify}, which does this -automatically). Currently, deletions must be represented as revising a row -to a special state (e.g., making the entries \code{NA} or including a special -column that flags the data as removed and performing some kind of -post-processing), and the archive is unaware of what this state is. Note -that \code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other reasons, -e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to -represent potential update data that we do not yet have access to; or in -\code{\link{epix_merge}} to represent the "value" of an observation before the -version in which it was first released, or if no version of that -observation appears in the archive data at all. - -\strong{A word of caution:} R6 objects, unlike most other objects in R, have -reference semantics. A primary consequence of this is that objects are not -copied when modified. You can read more about this in Hadley Wickham's -\href{https://adv-r.hadley.nz/r6.html#r6-semantics}{Advanced R} book. In order -to construct a modified archive while keeping the original intact, first -make a clone using the \verb{$clone} method, then overwrite the clone's \code{DT} -field with \code{data.table::copy(clone$DT)}, and finally perform the -modifications on the clone. } \section{Metadata}{ -The following pieces of metadata are included as fields in an \code{epi_archive} -object: -\itemize{ -\item \code{geo_type}: the type for the geo values. -\item \code{time_type}: the type for the time values. -\item \code{additional_metadata}: list of additional metadata for the data archive. -} - -Unlike an \code{epi_df} object, metadata for an \code{epi_archive} object \code{x} can be -accessed (and altered) directly, as in \code{x$geo_type} or \code{x$time_type}, -etc. Like an \code{epi_df} object, the \code{geo_type} and \code{time_type} fields in the -metadata of an \code{epi_archive} object are not currently used by any -downstream functions in the \code{epiprocess} package, and serve only as useful -bits of information to convey about the data set at hand. - - The following pieces of metadata are included as fields in an \code{epi_archive} object: \itemize{ @@ -151,16 +133,8 @@ bits of information to convey about the data set at hand. An \code{epi_archive} object can be used to generate a snapshot of the data in \code{epi_df} format, which represents the most up-to-date values of the signal -variables, as of the specified version. This is accomplished by calling the -\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this -method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. - - -An \code{epi_archive} object can be used to generate a snapshot of the data in -\code{epi_df} format, which represents the most up-to-date values of the signal -variables, as of the specified version. This is accomplished by calling the -\code{as_of()} method for an \code{epi_archive} object \code{x}. More details on this -method are documented in the wrapper function \code{\link[=epix_as_of]{epix_as_of()}}. +variables, as of the specified version. This is accomplished by calling +\code{epix_as_of()}. } \section{Sliding Computations}{ @@ -171,21 +145,11 @@ the \code{slide()} method for an \code{epi_archive} object, which works similarl the way \code{epi_slide()} works for an \code{epi_df} object, but with one key difference: it is version-aware. That is, for an \code{epi_archive} object, the sliding computation at any given reference time point t is performed on -\strong{data that would have been available as of t}. More details on \code{slide()} -are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. - - -We can run a sliding computation over an \code{epi_archive} object, much like -\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling -the \code{slide()} method for an \code{epi_archive} object, which works similarly to -the way \code{epi_slide()} works for an \code{epi_df} object, but with one key -difference: it is version-aware. That is, for an \code{epi_archive} object, the -sliding computation at any given reference time point t is performed on -\strong{data that would have been available as of t}. More details on \code{slide()} -are documented in the wrapper function \code{\link[=epix_slide]{epix_slide()}}. +\strong{data that would have been available as of t}. } \examples{ +# Simple ex. with necessary keys tib <- tibble::tibble( geo_value = rep(c("ca", "hi"), each = 5), time_value = rep(seq(as.Date("2020-01-01"), @@ -197,419 +161,36 @@ tib <- tibble::tibble( value = rnorm(10, mean = 2, sd = 1) ) -toy_epi_archive <- tib \%>\% epi_archive$new( +toy_epi_archive <- tib \%>\% as_epi_archive( geo_type = "state", time_type = "day" ) toy_epi_archive -tib <- tibble::tibble( - geo_value = rep(c("ca", "hi"), each = 5), - time_value = rep(seq(as.Date("2020-01-01"), - by = 1, length.out = 5 - ), times = 2), - version = rep(seq(as.Date("2020-01-02"), - by = 1, length.out = 5 - ), times = 2), - value = rnorm(10, mean = 2, sd = 1) + +# Ex. with an additional key for county +df <- data.frame( + geo_value = c(replicate(2, "ca"), replicate(2, "fl")), + county = c(1, 3, 2, 5), + time_value = c( + "2020-06-01", + "2020-06-02", + "2020-06-01", + "2020-06-02" + ), + version = c( + "2020-06-02", + "2020-06-03", + "2020-06-02", + "2020-06-03" + ), + cases = c(1, 2, 3, 4), + cases_rate = c(0.01, 0.02, 0.01, 0.05) ) -toy_epi_archive <- tib \%>\% new_epi_archive2( +x <- df \%>\% as_epi_archive( geo_type = "state", - time_type = "day" + time_type = "day", + other_keys = "county" ) -toy_epi_archive -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{DT}}{(\code{data.table})\cr -the (optionally compactified) datatable} - -\item{\code{geo_type}}{(string)\cr -the resolution of the geographic label (e.g. state)} - -\item{\code{time_type}}{(string)\cr -the resolution of the time column (e.g. day)} - -\item{\code{additional_metadata}}{(named list)\cr -any extra fields, such as \code{other_keys}} -\item{\code{clobberable_versions_start}}{(length-1 of same type&class as \code{version} column, or \code{NA})\cr -the earliest version number that might be rewritten in the future without assigning a new version -date/number, or \code{NA} if this won't happen} - -\item{\code{versions_end}}{(length-1 of same type&class as \code{version} column)\cr -the latest version observed} -} -\if{html}{\out{
}} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-epi_archive-new}{\code{epi_archive$new()}} -\item \href{#method-epi_archive-print}{\code{epi_archive$print()}} -\item \href{#method-epi_archive-as_of}{\code{epi_archive$as_of()}} -\item \href{#method-epi_archive-fill_through_version}{\code{epi_archive$fill_through_version()}} -\item \href{#method-epi_archive-truncate_versions_after}{\code{epi_archive$truncate_versions_after()}} -\item \href{#method-epi_archive-merge}{\code{epi_archive$merge()}} -\item \href{#method-epi_archive-group_by}{\code{epi_archive$group_by()}} -\item \href{#method-epi_archive-slide}{\code{epi_archive$slide()}} -\item \href{#method-epi_archive-clone}{\code{epi_archive$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-new}{}}} -\subsection{Method \code{new()}}{ -Creates a new \code{epi_archive} object. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$new( - x, - geo_type, - time_type, - other_keys, - additional_metadata, - compactify, - clobberable_versions_start, - versions_end -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{x}}{A data frame, data table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{\code{geo_type}}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{\code{time_type}}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{\code{other_keys}}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{\code{additional_metadata}}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{\code{compactify}}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space while maintaining the same behavior (with the help of the -\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). -\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will -remove these rows and issue a warning. Generally, this can be set to -\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} -such as its \code{DT}, or rely on redundant updates to achieve a certain -behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to -determine whether \code{compactify=TRUE} will produce the desired results. If -compactification here is removing a large proportion of the rows, this may -indicate a potential for space, time, or bandwidth savings upstream the -data pipeline, e.g., by avoiding fetching, storing, or processing these -rows of \code{x}.} - -\item{\code{clobberable_versions_start}}{Optional; as in \code{\link{as_epi_archive}}} - -\item{\code{versions_end}}{Optional; as in \code{\link{as_epi_archive}}} -} -\if{html}{\out{
}} -} -\subsection{Details}{ -Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information -and examples of parameter names. -Print information about an archive -} - -\subsection{Returns}{ -An \code{epi_archive} object. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-print}{}}} -\subsection{Method \code{print()}}{ -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$print(class = TRUE, methods = TRUE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{class}}{Boolean; whether to print the class label header} - -\item{\code{methods}}{Boolean; whether to print all available methods of -the archive} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-as_of}{}}} -\subsection{Method \code{as_of()}}{ -Generates a snapshot in \code{epi_df} format as of a given version. -See the documentation for the wrapper function \code{\link[=epix_as_of]{epix_as_of()}} for -details. The parameter descriptions below are copied from there -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$as_of(max_version, min_time_value = -Inf, all_versions = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{max_version}}{Version specifying the max version to permit in the -snapshot. That is, the snapshot will comprise the unique rows of the -current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose \code{time_value}s are at least -\code{min_time_value}).} - -\item{\code{min_time_value}}{Time value specifying the min \code{time_value} to permit in -the snapshot. Default is \code{-Inf}, which effectively means that there is no -minimum considered.} - -\item{\code{all_versions}}{Boolean; If \code{all_versions = TRUE}, then the output will be in -\code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a -potentially narrower \code{version} and \code{time_value} range than \code{x}, depending -on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} - -\item{\code{x}}{An \code{epi_archive} object} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-fill_through_version}{}}} -\subsection{Method \code{fill_through_version()}}{ -Fill in unobserved history using requested scheme by mutating -\code{self} and potentially reseating its fields. See -\code{\link{epix_fill_through_version}} for a full description of the non-R6-method -version, which doesn't mutate the input archive but might alias its fields. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$fill_through_version(fill_versions_end, how = c("na", "locf"))}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{fill_versions_end}}{as in \code{\link{epix_fill_through_version}}} - -\item{\code{how}}{as in \code{\link{epix_fill_through_version}}} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-truncate_versions_after}{}}} -\subsection{Method \code{truncate_versions_after()}}{ -Filter to keep only older versions, mutating the archive by -potentially reseating but not mutating some fields. \code{DT} is likely, but not -guaranteed, to be copied. Returns the mutated archive -\link[base:invisible]{invisibly}. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$truncate_versions_after(max_version)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{max_version}}{as in \code{\link{epix_truncate_versions_after}}} - -\item{\code{x}}{as in \code{\link{epix_truncate_versions_after}}} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-merge}{}}} -\subsection{Method \code{merge()}}{ -Merges another \code{epi_archive} with the current one, mutating the -current one by reseating its \code{DT} and several other fields, but avoiding -mutation of the old \code{DT}; returns the current archive -\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description -of the non-R6-method version, which does not mutate either archive, and -does not alias either archive's \code{DT}. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$merge( - y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{y}}{as in \code{\link{epix_merge}}} - -\item{\code{sync}}{as in \code{\link{epix_merge}}} - -\item{\code{compactify}}{as in \code{\link{epix_merge}} -group an epi_archive} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-group_by}{}}} -\subsection{Method \code{group_by()}}{ -group an epi_archive -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$group_by( - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(self) -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{...}}{variables or computations to group by. Computations are always -done on the ungrouped data frame. To perform computations on the grouped -data, you need to use a separate \code{\link[=mutate]{mutate()}} step before the -\code{\link[=group_by]{group_by()}}} - -\item{\code{.add}}{When \code{FALSE}, the default, \code{\link[=group_by]{group_by()}} will override existing -groups. To add to the existing groups, use \code{.add = TRUE}.} - -\item{\code{.drop}}{Drop groups formed by factor levels that don't appear in the -data. The default is \code{TRUE} except when \code{.data} has been previously grouped -with \code{.drop = FALSE}. See \code{\link[=group_by_drop_default]{group_by_drop_default()}} for details.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-slide}{}}} -\subsection{Method \code{slide()}}{ -Slides a given function over variables in an \code{epi_archive} -object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. The parameter descriptions below are copied from there -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$slide( - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{f}}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} parameter described below. One time step is -typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed -by a one-row tibble containing the values of the grouping variables for -the associated group; followed by a reference time value, usually as a -\code{Date} object; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each -group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via -\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the -computation.} - -\item{\code{...}}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an -expression for tidy evaluation; in addition to referring to columns -directly by name, the expression has access to \code{.data} and \code{.env} pronouns -as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and -\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} - -\item{\code{before}}{How far \code{before} each \code{ref_time_value} should the sliding -window extend? If provided, should be a single, non-NA, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window -endpoint is inclusive. For example, if \code{before = 7}, and one time step is -one day, then to produce a value for a \code{ref_time_value} of January 8, we -apply the given function or formula to data (for each group present) with -\code{time_value}s from January 1 onward, as they were reported on January 8. -For typical disease surveillance sources, this will not include any data -with a \code{time_value} of January 8, and, depending on the amount of reporting -latency, may not include January 7 or even earlier \code{time_value}s. (If -instead the archive were to hold nowcasts instead of regular surveillance -data, then we would indeed expect data for \code{time_value} January 8. If it -were to hold forecasts, then we would expect data for \code{time_value}s after -January 8, and the sliding window would extend as far after each -\code{ref_time_value} as needed to include all such \code{time_value}s.)} - -\item{\code{ref_time_values}}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - -\item{\code{time_step}}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - -\item{\code{new_col_name}}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{\code{as_list_col}}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{\code{names_sep}}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} - -\item{\code{all_versions}}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. -Default is \code{FALSE}.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-epi_archive-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{epi_archive$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} } diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 9a0a53ce..dc359a7b 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -34,37 +34,17 @@ Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for examples. } -\details{ -This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: - -\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input -archives, but may in some edge cases alias parts of the inputs, so copy the -outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, the only situation where there is potentially aliasing -is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change -in the future. -} \examples{ # warning message of data latency shown epix_as_of( - x = archive_cases_dv_subset, + archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version) ) - range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 epix_as_of( - x = archive_cases_dv_subset, + archive_cases_dv_subset, max_version = as.Date("2020-06-12") ) @@ -79,7 +59,7 @@ epix_as_of( withCallingHandlers( { epix_as_of( - x = archive_cases_dv_subset, + archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version) ) }, diff --git a/man/epix_as_of2.Rd b/man/epix_as_of2.Rd deleted file mode 100644 index ac69e9a9..00000000 --- a/man/epix_as_of2.Rd +++ /dev/null @@ -1,95 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_as_of2} -\alias{epix_as_of2} -\title{Generate a snapshot from an \code{epi_archive} object} -\usage{ -epix_as_of2( - epi_archive, - max_version, - min_time_value = -Inf, - all_versions = FALSE -) -} -\arguments{ -\item{max_version}{Time value specifying the max version to permit in the -snapshot. That is, the snapshot will comprise the unique rows of the -current archive data that represent the most up-to-date signal values, as -of the specified \code{max_version} (and whose time values are at least -\code{min_time_value}.)} - -\item{min_time_value}{Time value specifying the min time value to permit in -the snapshot. Default is \code{-Inf}, which effectively means that there is no -minimum considered.} - -\item{all_versions}{If \code{all_versions = TRUE}, then the output will be in -\code{epi_archive} format, and contain rows in the specified \code{time_value} range -having \code{version <= max_version}. The resulting object will cover a -potentially narrower \code{version} and \code{time_value} range than \code{x}, depending -on user-provided arguments. Otherwise, there will be one row in the output -for the \code{max_version} of each \code{time_value}. Default is \code{FALSE}.} - -\item{x}{An \code{epi_archive} object} -} -\value{ -An \code{epi_df} object. -} -\description{ -Generates a snapshot in \code{epi_df} format from an \code{epi_archive} object, as of a -given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. -} -\details{ -This is simply a wrapper around the \code{as_of()} method of the -\code{epi_archive} class, so if \code{x} is an \code{epi_archive} object, then: - -\if{html}{\out{
}}\preformatted{epix_as_of(x, max_version = v) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$as_of(max_version = v) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_as_of} and \verb{$as_of} will not mutate the input -archives, but may in some edge cases alias parts of the inputs, so copy the -outputs if needed before using mutating operations like \code{data.table}'s \verb{:=} -operator. Currently, the only situation where there is potentially aliasing -is of the \code{DT} in edge cases with \code{all_versions = TRUE}, but this may change -in the future. -} -\examples{ -# warning message of data latency shown -epix_as_of2( - archive_cases_dv_subset_2, - max_version = max(archive_cases_dv_subset_2$DT$version) -) - -range(archive_cases_dv_subset_2$DT$version) # 2020-06-02 -- 2021-12-01 - -epix_as_of2( - archive_cases_dv_subset_2, - max_version = as.Date("2020-06-12") -) - -# When fetching a snapshot as of the latest version with update data in the -# archive, a warning is issued by default, as this update data might not yet -# be finalized (for example, if data versions are labeled with dates, these -# versions might be overwritten throughout the corresponding days with -# additional data or "hotfixes" of erroroneous data; when we build an archive -# based on database queries, the latest available update might still be -# subject to change, but previous versions should be finalized). We can -# muffle such warnings with the following pattern: -withCallingHandlers( - { - epix_as_of2( - archive_cases_dv_subset_2, - max_version = max(archive_cases_dv_subset_2$DT$version) - ) - }, - epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") -) -# Since R 4.0, there is a `globalCallingHandlers` function that can be used -# to globally toggle these warnings. - -} diff --git a/man/epix_fill_through_version.Rd b/man/epix_fill_through_version.Rd index d5d2c278..a6f9c360 100644 --- a/man/epix_fill_through_version.Rd +++ b/man/epix_fill_through_version.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/methods-epi_archive.R \name{epix_fill_through_version} \alias{epix_fill_through_version} -\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} +\title{Fill \code{epi_archive} unobserved history} \usage{ epix_fill_through_version(x, fill_versions_end, how = c("na", "locf")) } @@ -34,11 +34,3 @@ another archive. This function provides one way to approach such mismatches: pretend that we've "observed" additional versions, filling in these versions with NAs or extrapolated values. } -\details{ -'\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result -might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate -\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to -give the result, but might reseat its fields (e.g., references to the old -\code{x$DT} might not be updated by this function or subsequent operations on -\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. -} diff --git a/man/epix_fill_through_version2.Rd b/man/epix_fill_through_version2.Rd deleted file mode 100644 index 7389388a..00000000 --- a/man/epix_fill_through_version2.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_fill_through_version2} -\alias{epix_fill_through_version2} -\title{\code{epi_archive} with unobserved history filled in (won't mutate, might alias)} -\usage{ -epix_fill_through_version2( - epi_archive, - fill_versions_end, - how = c("na", "locf") -) -} -\arguments{ -\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the -version through which to fill in missing version history; this will be the -result's \verb{$versions_end} unless it already had a later -\verb{$versions_end}.} - -\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing -required version history with \code{NA}s, by inserting (if necessary) an update -immediately after the current \verb{$versions_end} that revises all -existing measurements to be \code{NA} (this is only supported for \code{version} -classes with a \code{next_after} implementation); \code{"locf"} will fill in missing -version history with the last version of each observation carried forward -(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are -based on LOCF). Default is \code{"na"}.} - -\item{x}{An \code{epi_archive}} -} -\value{ -An \code{epi_archive} -} -\description{ -Sometimes, due to upstream data pipeline issues, we have to work with a -version history that isn't completely up to date, but with functions that -expect archives that are completely up to date, or equally as up-to-date as -another archive. This function provides one way to approach such mismatches: -pretend that we've "observed" additional versions, filling in these versions -with NAs or extrapolated values. -} -\details{ -'\code{epix_fill_through_version} will not mutate its \code{x} argument, but its result -might alias fields of \code{x} (e.g., mutating the result's \code{DT} might mutate -\code{x$DT}). The R6 method variant, \code{x$fill_through_version}, will mutate \code{x} to -give the result, but might reseat its fields (e.g., references to the old -\code{x$DT} might not be updated by this function or subsequent operations on -\code{x}), and returns the updated \code{x} \link[base:invisible]{invisibly}. -} diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd index 53dea071..ea0d2444 100644 --- a/man/epix_merge.Rd +++ b/man/epix_merge.Rd @@ -30,7 +30,7 @@ use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_en and discard any rows containing update rows for later versions.} \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be -compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. +compactified? See \code{as_epi_archive()} for an explanation of what this means. Default here is \code{TRUE}.} } \value{ @@ -38,21 +38,14 @@ the resulting \code{epi_archive} } \description{ Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and -set of key columns. When they also share a common \code{versions_end}, -using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and -\code{y} individually, then performing a full join of the \code{DT}s on the non-version -key columns (potentially consolidating multiple warnings about clobberable -versions). If the \code{versions_end} values differ, the -\code{sync} parameter controls what is done. +set of key columns. When they also share a common \code{versions_end}, using +\code{epix_as_of} on the result should be the same as using \code{epix_as_of} on \code{x} +and \code{y} individually, then performing a full join of the \code{DT}s on the +non-version key columns (potentially consolidating multiple warnings about +clobberable versions). If the \code{versions_end} values differ, the \code{sync} +parameter controls what is done. } \details{ -This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias -either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite -\code{x} with the result of the merge, reseating its \code{DT} and several other fields -(making them point to different objects), but avoiding mutation of the -contents of the old \code{DT} (only relevant if you have another reference to the -old \code{DT} in another object). - In all cases, \code{additional_metadata} will be an empty list, and \code{clobberable_versions_start} will be set to the earliest version that could be clobbered in either input archive. @@ -67,7 +60,5 @@ y <- archive_cases_dv_subset$DT \%>\% as_epi_archive(compactify = TRUE) # merge results stored in a third object: xy <- epix_merge(x, y) -# vs. mutating x to hold the merge result: -x$merge(y) } diff --git a/man/epix_merge2.Rd b/man/epix_merge2.Rd deleted file mode 100644 index 11d0aff5..00000000 --- a/man/epix_merge2.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_merge2} -\alias{epix_merge2} -\title{Merge two \code{epi_archive} objects} -\usage{ -epix_merge2( - x, - y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE -) -} -\arguments{ -\item{x, y}{Two \code{epi_archive} objects to join together.} - -\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the -case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?: -\code{"forbid"}: emit an error; "na": use \code{max(x$versions_end, y$versions_end)} -as the result's \code{versions_end}, but ensure that, if we request a snapshot -as of a version after \code{min(x$versions_end, y$versions_end)}, the -observation columns from the less up-to-date archive will be all NAs (i.e., -imagine there was an update immediately after its \code{versions_end} which -revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version -of each observation to be carried forward to extrapolate unavailable -versions for the less up-to-date input archive (i.e., imagining that in the -less up-to-date archive's data set remained unchanged between its actual -\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}: -use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end}, -and discard any rows containing update rows for later versions.} - -\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be -compactified? See \code{\link{as_epi_archive}} for an explanation of what this means. -Default here is \code{TRUE}.} -} -\value{ -the resulting \code{epi_archive} -} -\description{ -Merges two \code{epi_archive}s that share a common \code{geo_value}, \code{time_value}, and -set of key columns. When they also share a common \code{versions_end}, -using \verb{$as_of} on the result should be the same as using \verb{$as_of} on \code{x} and -\code{y} individually, then performing a full join of the \code{DT}s on the non-version -key columns (potentially consolidating multiple warnings about clobberable -versions). If the \code{versions_end} values differ, the -\code{sync} parameter controls what is done. -} -\details{ -This function, \code{\link{epix_merge}}, does not mutate its inputs and will not alias -either archive's \code{DT}, but may alias other fields; \code{x$merge} will overwrite -\code{x} with the result of the merge, reseating its \code{DT} and several other fields -(making them point to different objects), but avoiding mutation of the -contents of the old \code{DT} (only relevant if you have another reference to the -old \code{DT} in another object). - -In all cases, \code{additional_metadata} will be an empty list, and -\code{clobberable_versions_start} will be set to the earliest version that could -be clobbered in either input archive. -} -\examples{ -# create two example epi_archive datasets -x <- archive_cases_dv_subset_2$DT \%>\% - dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\% - as_epi_archive2(compactify = TRUE) -y <- archive_cases_dv_subset_2$DT \%>\% - dplyr::select(geo_value, time_value, version, percent_cli) \%>\% - as_epi_archive2(compactify = TRUE) -# merge results stored in a third object: -xy <- epix_merge2(x, y) - -} diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index 3ac55a18..c8f09594 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -1,7 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R \name{epix_slide} \alias{epix_slide} +\alias{epix_slide.epi_archive} +\alias{epix_slide.grouped_epi_archive} \title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} \usage{ epix_slide( @@ -16,6 +18,32 @@ epix_slide( names_sep = "_", all_versions = FALSE ) + +\method{epix_slide}{epi_archive}( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) + +\method{epix_slide}{grouped_epi_archive}( + x, + f, + ..., + before, + ref_time_values, + time_step, + new_col_name = "slide_value", + as_list_col = FALSE, + names_sep = "_", + all_versions = FALSE +) } \arguments{ \item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, @@ -64,8 +92,8 @@ January 8, and the sliding window would extend as far after each \item{ref_time_values}{Reference time values / versions for sliding computations; each element of this vector serves both as the anchor point for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of +\code{epix_as_of} which we fetch data in this window. If missing, then this will +set to a regularly-spaced sequence of values set to cover the range of \code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will be guessed (using the GCD of the skips between values).} @@ -162,31 +190,11 @@ Apart from the above distinctions, the interfaces between \code{epix_slide()} an Furthermore, the current function can be considerably slower than \code{epi_slide()}, for two reasons: (1) it must repeatedly fetch -properly-versioned snapshots from the data archive (via its \code{as_of()} -method), and (2) it performs a "manual" sliding of sorts, and does not -benefit from the highly efficient \code{slider} package. For this reason, it -should never be used in place of \code{epi_slide()}, and only used when -version-aware sliding is necessary (as it its purpose). - -Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an -object of either of these classes, then: - -\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place -mutation of the input archives on their own. In some edge cases the inputs it -feeds to the slide computations may alias parts of the input archive, so copy -the slide computation inputs if needed before using mutating operations like -\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of -the slide operation may alias parts of the input archive, so similarly, make -sure to clone and/or copy appropriately before using in-place mutation. +properly-versioned snapshots from the data archive (via \code{epix_as_of()}), +and (2) it performs a "manual" sliding of sorts, and does not benefit from +the highly efficient \code{slider} package. For this reason, it should never be +used in place of \code{epi_slide()}, and only used when version-aware sliding is +necessary (as it its purpose). } \examples{ library(dplyr) diff --git a/man/epix_slide2.Rd b/man/epix_slide2.Rd deleted file mode 100644 index 8d822bc0..00000000 --- a/man/epix_slide2.Rd +++ /dev/null @@ -1,283 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive_new.R -\name{epix_slide2} -\alias{epix_slide2} -\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} -\usage{ -epix_slide2( - x, - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -) -} -\arguments{ -\item{x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, -all data in \code{x} will be treated as part of a single data group.} - -\item{f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} parameter described below. One time step is -typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed -by a one-row tibble containing the values of the grouping variables for -the associated group; followed by a reference time value, usually as a -\code{Date} object; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each -group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via -\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the -computation.} - -\item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an -expression for tidy evaluation; in addition to referring to columns -directly by name, the expression has access to \code{.data} and \code{.env} pronouns -as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and -\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} - -\item{before}{How far \code{before} each \code{ref_time_value} should the sliding -window extend? If provided, should be a single, non-NA, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window -endpoint is inclusive. For example, if \code{before = 7}, and one time step is -one day, then to produce a value for a \code{ref_time_value} of January 8, we -apply the given function or formula to data (for each group present) with -\code{time_value}s from January 1 onward, as they were reported on January 8. -For typical disease surveillance sources, this will not include any data -with a \code{time_value} of January 8, and, depending on the amount of reporting -latency, may not include January 7 or even earlier \code{time_value}s. (If -instead the archive were to hold nowcasts instead of regular surveillance -data, then we would indeed expect data for \code{time_value} January 8. If it -were to hold forecasts, then we would expect data for \code{time_value}s after -January 8, and the sliding window would extend as far after each -\code{ref_time_value} as needed to include all such \code{time_value}s.)} - -\item{ref_time_values}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - -\item{new_col_name}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} - -\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. -Default is \code{FALSE}.} -} -\value{ -A tibble whose columns are: the grouping variables, \code{time_value}, -containing the reference time values for the slide computation, and a -column named according to the \code{new_col_name} argument, containing the slide -values. -} -\description{ -Slides a given function over variables in an \code{epi_archive} object. This -behaves similarly to \code{epi_slide()}, with the key exception that it is -version-aware: the sliding computation at any given reference time t is -performed on \strong{data that would have been available as of t}. See the -\href{https://cmu-delphi.github.io/epiprocess/articles/archive.html}{archive vignette} for -examples. -} -\details{ -A few key distinctions between the current function and \code{epi_slide()}: -\enumerate{ -\item In \code{f} functions for \code{epix_slide}, one should not assume that the input -data to contain any rows with \code{time_value} matching the computation's -\code{ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for -typical epidemiological surveillance data, observations pertaining to a -particular time period (\code{time_value}) are first reported \code{as_of} some -instant after that time period has ended. -\item \code{epix_slide()} doesn't accept an \code{after} argument; its windows extend -from \code{before} time steps before a given \code{ref_time_value} through the last -\code{time_value} available as of version \code{ref_time_value} (typically, this -won't include \code{ref_time_value} itself, as observations about a particular -time interval (e.g., day) are only published after that time interval -ends); \code{epi_slide} windows extend from \code{before} time steps before a -\code{ref_time_value} through \code{after} time steps after \code{ref_time_value}. -\item The input class and columns are similar but different: \code{epix_slide} -(with the default \code{all_versions=FALSE}) keeps all columns and the -\code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only -provides the grouping variables in the second input, and will convert the -first input into a regular tibble if the grouping variables include the -essential \code{geo_value} column. (With \code{all_versions=TRUE}, \code{epix_slide} will -will provide an \code{epi_archive} rather than an \code{epi-df} to each -computation.) -\item The output class and columns are similar but different: \code{epix_slide()} -returns a tibble containing only the grouping variables, \code{time_value}, and -the new column(s) from the slide computations, whereas \code{epi_slide()} -returns an \code{epi_df} with all original variables plus the new columns from -the slide computations. (Both will mirror the grouping or ungroupedness of -their input, with one exception: \code{epi_archive}s can have trivial -(zero-variable) groupings, but these will be dropped in \code{epix_slide} -results as they are not supported by tibbles.) -\item There are no size stability checks or element/row recycling to maintain -size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is -roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly -analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed -in the "advanced" vignette. -\item \code{all_rows} is not supported in \code{epix_slide}; since the slide -computations are allowed more flexibility in their outputs than in -\code{epi_slide}, we can't guess a good representation for missing computations -for excluded group-\code{ref_time_value} pairs. -\item The \code{ref_time_values} default for \code{epix_slide} is based on making an -evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the -\code{versions_end}, rather than the \code{time_value}s. -} - -Apart from the above distinctions, the interfaces between \code{epix_slide()} and -\code{epi_slide()} are the same. - -Furthermore, the current function can be considerably slower than -\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch -properly-versioned snapshots from the data archive (via its \code{as_of()} -method), and (2) it performs a "manual" sliding of sorts, and does not -benefit from the highly efficient \code{slider} package. For this reason, it -should never be used in place of \code{epi_slide()}, and only used when -version-aware sliding is necessary (as it its purpose). - -Finally, this is simply a wrapper around the \code{slide()} method of the -\code{epi_archive} and \code{grouped_epi_archive} classes, so if \code{x} is an -object of either of these classes, then: - -\if{html}{\out{
}}\preformatted{epix_slide(x, new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -is equivalent to: - -\if{html}{\out{
}}\preformatted{x$slide(new_var = comp(old_var), before = 119) -}\if{html}{\out{
}} - -Mutation and aliasing: \code{epix_slide} and \verb{$slide} will not perform in-place -mutation of the input archives on their own. In some edge cases the inputs it -feeds to the slide computations may alias parts of the input archive, so copy -the slide computation inputs if needed before using mutating operations like -\code{data.table}'s \verb{:=} operator. Similarly, in some edge cases, the output of -the slide operation may alias parts of the input archive, so similarly, make -sure to clone and/or copy appropriately before using in-place mutation. -} -\examples{ -library(dplyr) - -# Reference time points for which we want to compute slide values: -ref_time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-15"), - by = "1 day" -) - -# A simple (but not very useful) example (see the archive vignette for a more -# realistic one): -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = ref_time_values, - new_col_name = "case_rate_7d_av_recent_av" - ) \%>\% - ungroup() -# We requested time windows that started 2 days before the corresponding time -# values. The actual number of `time_value`s in each computation depends on -# the reporting latency of the signal and `time_value` range covered by the -# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -# discarded -# * 1 `time_value`, for ref time 2020-06-02 -# * 2 `time_value`s, for the rest of the results -# * never the 3 `time_value`s we would get from `epi_slide`, since, because -# of data latency, we'll never have an observation -# `time_value == ref_time_value` as of `ref_time_value`. -# The example below shows this type of behavior in more detail. - -# Examining characteristics of the data passed to each computation with -# `all_versions=FALSE`. -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - function(x, gk, rtv) { - tibble( - time_range = if (nrow(x) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) - }, - n = nrow(x), - class1 = class(x)[[1L]] - ) - }, - before = 5, all_versions = FALSE, - ref_time_values = ref_time_values, names_sep = NULL - ) \%>\% - ungroup() \%>\% - arrange(geo_value, time_value) - -# --- Advanced: --- - -# `epix_slide` with `all_versions=FALSE` (the default) applies a -# version-unaware computation to several versions of the data. We can also -# use `all_versions=TRUE` to apply a version-*aware* computation to several -# versions of the data, again looking at characteristics of the data passed -# to each computation. In this case, each computation should expect an -# `epi_archive` containing the relevant version data: - -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - function(x, gk, rtv) { - tibble( - versions_start = if (nrow(x$DT) == 0L) { - "NA (0 rows)" - } else { - toString(min(x$DT$version)) - }, - versions_end = x$versions_end, - time_range = if (nrow(x$DT) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$DT$time_value), max(x$DT$time_value)) - }, - n = nrow(x$DT), - class1 = class(x)[[1L]] - ) - }, - before = 5, all_versions = TRUE, - ref_time_values = ref_time_values, names_sep = NULL - ) \%>\% - ungroup() \%>\% - # Focus on one geo_value so we can better see the columns above: - filter(geo_value == "ca") \%>\% - select(-geo_value) - -} diff --git a/man/epix_truncate_versions_after.Rd b/man/epix_truncate_versions_after.Rd index f30be07f..c12cf9bb 100644 --- a/man/epix_truncate_versions_after.Rd +++ b/man/epix_truncate_versions_after.Rd @@ -1,31 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, -% R/methods-epi_archive_new.R +% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R \name{epix_truncate_versions_after} \alias{epix_truncate_versions_after} +\alias{epix_truncate_versions_after.epi_archive} +\alias{epix_truncate_versions_after.grouped_epi_archive} \title{Filter an \code{epi_archive} object to keep only older versions} \usage{ epix_truncate_versions_after(x, max_version) -epix_truncate_versions_after(x, max_version) +\method{epix_truncate_versions_after}{epi_archive}(x, max_version) + +\method{epix_truncate_versions_after}{grouped_epi_archive}(x, max_version) } \arguments{ -\item{x}{An \code{epi_archive} object} +\item{x}{An \code{epi_archive} object.} -\item{max_version}{Time value specifying the max version to permit in the -filtered archive. That is, the output archive will comprise rows of the -current archive data having \code{version} less than or equal to the -specified \code{max_version}} +\item{max_version}{The latest version to include in the archive.} } \value{ -An \code{epi_archive} object - An \code{epi_archive} object } \description{ -Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping -only rows with \code{version} falling on or before a specified date. - Generates a filtered \code{epi_archive} from an \code{epi_archive} object, keeping only rows with \code{version} falling on or before a specified date. } diff --git a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd b/man/epix_truncate_versions_after.grouped_epi_archive2.Rd deleted file mode 100644 index 5fba48fb..00000000 --- a/man/epix_truncate_versions_after.grouped_epi_archive2.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/grouped_archive_new.R -\name{epix_truncate_versions_after.grouped_epi_archive2} -\alias{epix_truncate_versions_after.grouped_epi_archive2} -\title{Truncate versions after a given version, grouped} -\usage{ -\method{epix_truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) -} -\description{ -Truncate versions after a given version, grouped -} diff --git a/man/fill_through_version.epi_archive2.Rd b/man/fill_through_version.epi_archive2.Rd deleted file mode 100644 index 48afb864..00000000 --- a/man/fill_through_version.epi_archive2.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{fill_through_version.epi_archive2} -\alias{fill_through_version.epi_archive2} -\title{Fill through version} -\usage{ -\method{fill_through_version}{epi_archive2}(epi_archive, fill_versions_end, how = c("na", "locf")) -} -\arguments{ -\item{epi_archive}{an \code{epi_archive} object} - -\item{fill_versions_end}{as in \code{\link{epix_fill_through_version}}} - -\item{how}{as in \code{\link{epix_fill_through_version}}} -} -\description{ -Fill in unobserved history using requested scheme by mutating -the given object and potentially reseating its fields. See -\code{\link{epix_fill_through_version}}, which doesn't mutate the input archive but -might alias its fields. -} diff --git a/man/group_by.epi_archive.Rd b/man/group_by.epi_archive.Rd index f157e834..782d5f3f 100644 --- a/man/group_by.epi_archive.Rd +++ b/man/group_by.epi_archive.Rd @@ -1,47 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_archive_new.R, -% R/grouped_epi_archive.R +% Please edit documentation in R/archive.R, R/grouped_epi_archive.R \name{group_by.epi_archive} \alias{group_by.epi_archive} \alias{grouped_epi_archive} -\alias{group_by.grouped_epi_archive2} -\alias{group_by_drop_default.grouped_epi_archive2} -\alias{groups.grouped_epi_archive2} -\alias{ungroup.grouped_epi_archive2} -\alias{is_grouped_epi_archive2} \alias{group_by.grouped_epi_archive} +\alias{group_by_drop_default.grouped_epi_archive} \alias{groups.grouped_epi_archive} \alias{ungroup.grouped_epi_archive} \alias{is_grouped_epi_archive} -\alias{group_by_drop_default.grouped_epi_archive} \title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} \usage{ \method{group_by}{epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) -\method{group_by}{grouped_epi_archive2}( - grouped_epi_archive, - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(grouped_epi_archive) -) - -\method{group_by_drop_default}{grouped_epi_archive2}(grouped_epi_archive) - -\method{groups}{grouped_epi_archive2}(grouped_epi_archive) - -\method{ungroup}{grouped_epi_archive2}(grouped_epi_archive, ...) - -is_grouped_epi_archive2(x) - \method{group_by}{grouped_epi_archive}(.data, ..., .add = FALSE, .drop = dplyr::group_by_drop_default(.data)) +\method{group_by_drop_default}{grouped_epi_archive}(.tbl) + \method{groups}{grouped_epi_archive}(x) \method{ungroup}{grouped_epi_archive}(x, ...) is_grouped_epi_archive(x) - -\method{group_by_drop_default}{grouped_epi_archive}(.tbl) } \arguments{ \item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} @@ -71,12 +50,10 @@ grouped by the current grouping variables plus the variable selection from \item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of factor columns.} +\item{.tbl}{A \code{grouped_epi_archive} object.} + \item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for \code{is_grouped_epi_archive}: any object} - -\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or -\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; -\code{grouped_epi_archive} dispatches its own S3 method)} } \description{ \code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} @@ -97,17 +74,6 @@ the same operations on tibbles and grouped tibbles, which will \emph{not} output Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is disabled; instead, \code{ungroup} first then \code{group_by}. -Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT}, -introducing column-level aliasing between its input and its result. This -doesn't follow the general model for most \code{data.table} operations, which -seems to be that, given an nonaliased (i.e., unique) pointer to a -\code{data.table} object, its pointers to its columns should also be nonaliased. -If you mutate any of the columns of either the input or result, first ensure -that it is fine if columns of the other are also mutated, but do not rely on -such behavior to occur. Additionally, never perform mutation on the key -columns at all (except for strictly increasing transformations), as this will -invalidate sortedness assumptions about the rows. - \code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch to \code{group_by_drop_default.default} (but there is a dedicated method for \code{grouped_epi_archive}s). diff --git a/man/group_by.epi_archive2.Rd b/man/group_by.epi_archive2.Rd deleted file mode 100644 index fa9040c3..00000000 --- a/man/group_by.epi_archive2.Rd +++ /dev/null @@ -1,147 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{group_by.epi_archive2} -\alias{group_by.epi_archive2} -\alias{grouped_epi_archive} -\title{\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive}} -\usage{ -\method{group_by}{epi_archive2}( - epi_archive, - ..., - .add = FALSE, - .drop = dplyr::group_by_drop_default(epi_archive) -) -} -\arguments{ -\item{...}{Similar to \code{\link[dplyr:group_by]{dplyr::group_by}} (see "Details:" for edge cases); -\itemize{ -\item For \code{group_by}: unquoted variable name(s) or other -\link[dplyr:dplyr_data_masking]{"data masking"} expression(s). It's possible to -use \code{\link[dplyr:mutate]{dplyr::mutate}}-like syntax here to calculate new columns on which to -perform grouping, but note that, if you are regrouping an already-grouped -\code{.data} object, the calculations will be carried out ignoring such grouping -(same as \link[dplyr:group_by]{in dplyr}). -\item For \code{ungroup}: either -\itemize{ -\item empty, in order to remove the grouping and output an \code{epi_archive}; or -\item variable name(s) or other \link[dplyr:dplyr_tidy_select]{"tidy-select"} -expression(s), in order to remove the matching variables from the list of -grouping variables, and output another \code{grouped_epi_archive}. -} -}} - -\item{.add}{Boolean. If \code{FALSE}, the default, the output will be grouped by -the variable selection from \code{...} only; if \code{TRUE}, the output will be -grouped by the current grouping variables plus the variable selection from -\code{...}.} - -\item{.drop}{As described in \code{\link[dplyr:group_by]{dplyr::group_by}}; determines treatment of -factor columns.} - -\item{.data}{An \code{epi_archive} or \code{grouped_epi_archive}} - -\item{x}{For \code{groups} or \code{ungroup}: a \code{grouped_epi_archive}; for -\code{is_grouped_epi_archive}: any object} - -\item{.tbl}{(For \code{group_by_drop_default}:) an \code{epi_archive} or -\code{grouped_epi_archive} (\code{epi_archive} dispatches to the S3 default method; -\code{grouped_epi_archive} dispatches its own S3 method)} -} -\description{ -\code{group_by} and related methods for \code{epi_archive}, \code{grouped_epi_archive} -} -\details{ -To match \code{dplyr}, \code{group_by} allows "data masking" (also referred to as -"tidy evaluation") expressions \code{...}, not just column names, in a way similar -to \code{mutate}. Note that replacing or removing key columns with these -expressions is disabled. - -\code{archive \%>\% group_by()} and other expressions that group or regroup by zero -columns (indicating that all rows should be treated as part of one large -group) will output a \code{grouped_epi_archive}, in order to enable the use of -\code{grouped_epi_archive} methods on the result. This is in slight contrast to -the same operations on tibbles and grouped tibbles, which will \emph{not} output a -\code{grouped_df} in these circumstances. - -Using \code{group_by} with \code{.add=FALSE} to override the existing grouping is -disabled; instead, \code{ungroup} first then \code{group_by}. - -Mutation and aliasing: \code{group_by} tries to use a shallow copy of the \code{DT}, -introducing column-level aliasing between its input and its result. This -doesn't follow the general model for most \code{data.table} operations, which -seems to be that, given an nonaliased (i.e., unique) pointer to a -\code{data.table} object, its pointers to its columns should also be nonaliased. -If you mutate any of the columns of either the input or result, first ensure -that it is fine if columns of the other are also mutated, but do not rely on -such behavior to occur. Additionally, never perform mutation on the key -columns at all (except for strictly increasing transformations), as this will -invalidate sortedness assumptions about the rows. - -\code{group_by_drop_default} on (ungrouped) \code{epi_archive}s is expected to dispatch -to \code{group_by_drop_default.default} (but there is a dedicated method for -\code{grouped_epi_archive}s). -} -\examples{ - -grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) - -# `print` for metadata and method listing: -grouped_archive \%>\% print() - -# The primary use for grouping is to perform a grouped `epix_slide`: - -archive_cases_dv_subset_2 \%>\% - group_by(geo_value) \%>\% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = as.Date("2020-06-11") + 0:2, - new_col_name = "case_rate_3d_av" - ) \%>\% - ungroup() - -# ----------------------------------------------------------------- - -# Advanced: some other features of dplyr grouping are implemented: - -library(dplyr) -toy_archive <- - tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) - ) \%>\% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version) - ) \%>\% - as_epi_archive2(other_keys = "age_group") - -# The following are equivalent: -toy_archive \%>\% group_by(geo_value, age_group) -toy_archive \%>\% - group_by(geo_value) \%>\% - group_by(age_group, .add = TRUE) -grouping_cols <- c("geo_value", "age_group") -toy_archive \%>\% group_by(across(all_of(grouping_cols))) - -# And these are equivalent: -toy_archive \%>\% group_by(geo_value) -toy_archive \%>\% - group_by(geo_value, age_group) \%>\% - ungroup(age_group) - -# To get the grouping variable names as a `list` of `name`s (a.k.a. symbols): -toy_archive \%>\% - group_by(geo_value) \%>\% - groups() - -toy_archive \%>\% - group_by(geo_value, age_group, .drop = FALSE) \%>\% - epix_slide2(f = ~ sum(.x$value), before = 20) \%>\% - ungroup() - -} diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd index 2beb3a8c..06669709 100644 --- a/man/is_epi_archive.Rd +++ b/man/is_epi_archive.Rd @@ -25,7 +25,7 @@ is_epi_archive(archive_cases_dv_subset) # TRUE # By default, grouped_epi_archives don't count as epi_archives, as they may # support a different set of operations from regular `epi_archives`. This # behavior can be controlled by `grouped_okay`. -grouped_archive <- archive_cases_dv_subset$group_by(geo_value) +grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) is_epi_archive(grouped_archive) # FALSE is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE diff --git a/man/is_epi_archive2.Rd b/man/is_epi_archive2.Rd deleted file mode 100644 index df258d3e..00000000 --- a/man/is_epi_archive2.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{is_epi_archive2} -\alias{is_epi_archive2} -\title{Test for \code{epi_archive} format} -\usage{ -is_epi_archive2(x, grouped_okay = FALSE) -} -\arguments{ -\item{x}{An object.} - -\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also -count? Default is \code{FALSE}.} -} -\value{ -\code{TRUE} if the object inherits from \code{epi_archive}. -} -\description{ -Test for \code{epi_archive} format -} -\examples{ -is_epi_archive2(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -is_epi_archive2(archive_cases_dv_subset_2) # TRUE - -# By default, grouped_epi_archives don't count as epi_archives, as they may -# support a different set of operations from regular `epi_archives`. This -# behavior can be controlled by `grouped_okay`. -grouped_archive <- archive_cases_dv_subset_2 \%>\% group_by(geo_value) -is_epi_archive2(grouped_archive) # FALSE -is_epi_archive2(grouped_archive, grouped_okay = TRUE) # TRUE - -} -\seealso{ -\code{\link{is_grouped_epi_archive}} -} diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index 6f0d35b3..cca554fa 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -1,25 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R, R/archive_new.R +% Please edit documentation in R/archive.R \name{max_version_with_row_in} \alias{max_version_with_row_in} \title{\code{max(x$version)}, with error if \code{x} has 0 rows} \usage{ -max_version_with_row_in(x) - max_version_with_row_in(x) } \arguments{ \item{x}{\code{x} argument of \code{\link{as_epi_archive}}} } \value{ -\code{max(x$version)} if it has any rows; raises error if it has 0 rows or -an \code{NA} version value - \code{max(x$version)} if it has any rows; raises error if it has 0 rows or an \code{NA} version value } \description{ -Exported to make defaults more easily copyable. - Exported to make defaults more easily copyable. } diff --git a/man/merge_epi_archive2.Rd b/man/merge_epi_archive2.Rd deleted file mode 100644 index dd1e671e..00000000 --- a/man/merge_epi_archive2.Rd +++ /dev/null @@ -1,30 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{merge_epi_archive2} -\alias{merge_epi_archive2} -\title{Merge epi archive} -\usage{ -merge_epi_archive2( - x, - y, - sync = c("forbid", "na", "locf", "truncate"), - compactify = TRUE -) -} -\arguments{ -\item{x}{as in \code{\link{epix_merge}}} - -\item{y}{as in \code{\link{epix_merge}}} - -\item{sync}{as in \code{\link{epix_merge}}} - -\item{compactify}{as in \code{\link{epix_merge}}} -} -\description{ -Merges another \code{epi_archive} with the current one, mutating the -current one by reseating its \code{DT} and several other fields, but avoiding -mutation of the old \code{DT}; returns the current archive -\link[base:invisible]{invisibly}. See \code{\link{epix_merge}} for a full description -of the non-R6-method version, which does not mutate either archive, and -does not alias either archive's \code{DT}.a -} diff --git a/man/new_epi_archive2.Rd b/man/new_epi_archive2.Rd deleted file mode 100644 index 52141190..00000000 --- a/man/new_epi_archive2.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{new_epi_archive2} -\alias{new_epi_archive2} -\title{New epi archive} -\usage{ -new_epi_archive2( - x, - geo_type = NULL, - time_type = NULL, - other_keys = NULL, - additional_metadata = NULL, - compactify = NULL, - clobberable_versions_start = NA, - versions_end = NULL -) -} -\arguments{ -\item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, -\code{time_value}, \code{version}, and then any additional number of columns.} - -\item{geo_type}{Type for the geo values. If missing, then the function will -attempt to infer it from the geo values present; if this fails, then it -will be set to "custom".} - -\item{time_type}{Type for the time values. If missing, then the function will -attempt to infer it from the time values present; if this fails, then it -will be set to "custom".} - -\item{other_keys}{Character vector specifying the names of variables in \code{x} -that should be considered key variables (in the language of \code{data.table}) -apart from "geo_value", "time_value", and "version".} - -\item{additional_metadata}{List of additional metadata to attach to the -\code{epi_archive} object. The metadata will have \code{geo_type} and \code{time_type} -fields; named entries from the passed list or will be included as well.} - -\item{compactify}{Optional; Boolean or \code{NULL}: should we remove rows that are -considered redundant for the purposes of \code{epi_archive}'s built-in methods -such as \code{as_of}? As these methods use the last version of each observation -carried forward (LOCF) to interpolate between the version data provided, -rows that don't change these LOCF results can potentially be omitted to -save space while maintaining the same behavior (with the help of the -\code{clobberable_versions_start} and \code{versions_end} fields in some edge cases). -\code{TRUE} will remove these rows, \code{FALSE} will not, and missing or \code{NULL} will -remove these rows and issue a warning. Generally, this can be set to -\code{TRUE}, but if you directly inspect or edit the fields of the \code{epi_archive} -such as its \code{DT}, or rely on redundant updates to achieve a certain -behavior of the \code{ref_time_values} default in \code{epix_slide}, you will have to -determine whether \code{compactify=TRUE} will produce the desired results. If -compactification here is removing a large proportion of the rows, this may -indicate a potential for space, time, or bandwidth savings upstream the -data pipeline, e.g., by avoiding fetching, storing, or processing these -rows of \code{x}.} - -\item{clobberable_versions_start}{Optional; as in \code{\link{as_epi_archive}}} - -\item{versions_end}{Optional; as in \code{\link{as_epi_archive}}} -} -\value{ -An \code{epi_archive} object. -} -\description{ -Creates a new \code{epi_archive} object. -} -\details{ -Refer to the documentation for \code{\link[=as_epi_archive]{as_epi_archive()}} for more information -and examples of parameter names. -} diff --git a/man/next_after.Rd b/man/next_after.Rd index 82fd3ebb..5170e8d9 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -1,23 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R, R/archive_new.R +% Please edit documentation in R/archive.R \name{next_after} \alias{next_after} \title{Get the next possible value greater than \code{x} of the same type} \usage{ -next_after(x) - next_after(x) } \arguments{ \item{x}{the starting "value"(s)} } \value{ -same class, typeof, and length as \code{x} - same class, typeof, and length as \code{x} } \description{ -Get the next possible value greater than \code{x} of the same type - Get the next possible value greater than \code{x} of the same type } diff --git a/man/print.epi_archive2.Rd b/man/print.epi_archive.Rd similarity index 56% rename from man/print.epi_archive2.Rd rename to man/print.epi_archive.Rd index 0105c47e..6f823ccd 100644 --- a/man/print.epi_archive2.Rd +++ b/man/print.epi_archive.Rd @@ -1,12 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{print.epi_archive2} -\alias{print.epi_archive2} +% Please edit documentation in R/archive.R +\name{print.epi_archive} +\alias{print.epi_archive} \title{Print information about an \code{epi_archive} object} \usage{ -\method{print}{epi_archive2}(epi_archive, class = TRUE, methods = TRUE) +\method{print}{epi_archive}(x, ..., class = TRUE, methods = TRUE) } \arguments{ +\item{x}{An \code{epi_archive} object.} + +\item{...}{Should be empty, there to satisfy the S3 generic.} + \item{class}{Boolean; whether to print the class label header} \item{methods}{Boolean; whether to print all available methods of diff --git a/man/slide.epi_archive2.Rd b/man/slide.epi_archive2.Rd deleted file mode 100644 index 54db5636..00000000 --- a/man/slide.epi_archive2.Rd +++ /dev/null @@ -1,101 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{slide.epi_archive2} -\alias{slide.epi_archive2} -\title{Slide over epi archive} -\usage{ -\method{slide}{epi_archive2}( - epi_archive, - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -) -} -\arguments{ -\item{f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{before} parameter described below. One time step is -typically one day or one week; see \code{\link{epi_slide}} details for more -explanation. If a function, \code{f} must take an \code{epi_df} with the same -column names as the archive's \code{DT}, minus the \code{version} column; followed -by a one-row tibble containing the values of the grouping variables for -the associated group; followed by a reference time value, usually as a -\code{Date} object; followed by any number of named arguments. If a formula, -\code{f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as -in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each -group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via -\code{.z} or \code{.ref_time_value}. If \code{f} is missing, then \code{...} will specify the -computation.} - -\item{...}{Additional arguments to pass to the function or formula specified -via \code{f}. Alternatively, if \code{f} is missing, then \code{...} is interpreted as an -expression for tidy evaluation; in addition to referring to columns -directly by name, the expression has access to \code{.data} and \code{.env} pronouns -as in \code{dplyr} verbs, and can also refer to the \code{.group_key} and -\code{.ref_time_value}. See details of \code{\link{epi_slide}}.} - -\item{before}{How far \code{before} each \code{ref_time_value} should the sliding -window extend? If provided, should be a single, non-NA, -\link[vctrs:vec_cast]{integer-compatible} number of time steps. This window -endpoint is inclusive. For example, if \code{before = 7}, and one time step is -one day, then to produce a value for a \code{ref_time_value} of January 8, we -apply the given function or formula to data (for each group present) with -\code{time_value}s from January 1 onward, as they were reported on January 8. -For typical disease surveillance sources, this will not include any data -with a \code{time_value} of January 8, and, depending on the amount of reporting -latency, may not include January 7 or even earlier \code{time_value}s. (If -instead the archive were to hold nowcasts instead of regular surveillance -data, then we would indeed expect data for \code{time_value} January 8. If it -were to hold forecasts, then we would expect data for \code{time_value}s after -January 8, and the sliding window would extend as far after each -\code{ref_time_value} as needed to include all such \code{time_value}s.)} - -\item{ref_time_values}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{as_of} which we fetch data in this window. If missing, then this will set -to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} - -\item{time_step}{Optional function used to define the meaning of one time -step, which if specified, overrides the default choice based on the -\code{time_value} column. This function must take a positive integer and return -an object of class \code{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this -would only be meaningful if \code{time_value} is of class \code{POSIXct}).} - -\item{new_col_name}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_name} equal to an existing column name will overwrite this column.} - -\item{as_list_col}{Should the slide results be held in a list column, or be -\link[tidyr:chop]{unchopped}/\link[tidyr:unnest]{unnested}? Default is \code{FALSE}, -in which case a list object returned by \code{f} would be unnested (using -\code{\link[tidyr:unnest]{tidyr::unnest()}}), and, if the slide computations output data frames, -the names of the resulting columns are given by prepending \code{new_col_name} -to the names of the list elements.} - -\item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} -when \code{as_list_col = FALSE}. Default is "_". Using \code{NULL} drops the prefix -from \code{new_col_name} entirely.} - -\item{all_versions}{(Not the same as \code{all_rows} parameter of \code{epi_slide}.) If -\code{all_versions = TRUE}, then \code{f} will be passed the version history (all -\code{version <= ref_time_value}) for rows having \code{time_value} between -\code{ref_time_value - before} and \code{ref_time_value}. Otherwise, \code{f} will be -passed only the most recent \code{version} for every unique \code{time_value}. -Default is \code{FALSE}.} -} -\description{ -Slides a given function over variables in an \code{epi_archive} -object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. The parameter descriptions below are copied from there -} diff --git a/man/slide.grouped_epi_archive2.Rd b/man/slide.grouped_epi_archive2.Rd deleted file mode 100644 index b5aac24c..00000000 --- a/man/slide.grouped_epi_archive2.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/grouped_archive_new.R -\name{slide.grouped_epi_archive2} -\alias{slide.grouped_epi_archive2} -\title{Slide over grouped epi archive} -\usage{ -\method{slide}{grouped_epi_archive2}( - grouped_epi_archive, - f, - ..., - before, - ref_time_values, - time_step, - new_col_name = "slide_value", - as_list_col = FALSE, - names_sep = "_", - all_versions = FALSE -) -} -\description{ -Slides a given function over variables in a \code{grouped_epi_archive} -object. See the documentation for the wrapper function \code{\link[=epix_slide]{epix_slide()}} for -details. -} diff --git a/man/truncate_versions_after.epi_archive2.Rd b/man/truncate_versions_after.epi_archive2.Rd deleted file mode 100644 index 08ae40d4..00000000 --- a/man/truncate_versions_after.epi_archive2.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive_new.R -\name{truncate_versions_after.epi_archive2} -\alias{truncate_versions_after.epi_archive2} -\title{Truncate versions after} -\usage{ -\method{truncate_versions_after}{epi_archive2}(epi_archive, max_version) -} -\arguments{ -\item{epi_archive}{as in \code{\link{epix_truncate_versions_after}}} - -\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} -} -\description{ -Filter to keep only older versions, mutating the archive by -potentially reseating but not mutating some fields. \code{DT} is likely, but not -guaranteed, to be copied. Returns the mutated archive -\link[base:invisible]{invisibly}. -} diff --git a/man/truncate_versions_after.grouped_epi_archive2.Rd b/man/truncate_versions_after.grouped_epi_archive2.Rd deleted file mode 100644 index 7c25950f..00000000 --- a/man/truncate_versions_after.grouped_epi_archive2.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/grouped_archive_new.R -\name{truncate_versions_after.grouped_epi_archive2} -\alias{truncate_versions_after.grouped_epi_archive2} -\title{Truncate versions after a given version, grouped} -\usage{ -\method{truncate_versions_after}{grouped_epi_archive2}(grouped_epi_archive, max_version) -} -\arguments{ -\item{max_version}{as in \code{\link{epix_truncate_versions_after}}} - -\item{x}{as in \code{\link{epix_truncate_versions_after}}} -} -\description{ -Filter to keep only older versions by mutating the underlying -\code{epi_archive} using \verb{$truncate_versions_after}. Returns the mutated -\code{grouped_epi_archive} \link[base:invisible]{invisibly}. -} diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index 47506152..d78167d7 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -111,12 +111,12 @@ test_that("archive version bounds args work as intended", { ) expect_error(as_epi_archive(update_tbl, versions_end = NA), regexp = "must have the same classes") ea_default <- as_epi_archive(update_tbl) - ea_default$as_of(measurement_date + 4L) + ea_default %>% epix_as_of(measurement_date + 4L) expect_warning( regexp = NA, - ea_default$as_of(measurement_date + 5L), - class = "epiprocess__snapshot_as_of_clobberable_version" + ea_default %>% epix_as_of(measurement_date + 5L), + class = "epiprocess__snapshot_epix_as_of_clobberable_version" ) - ea_default$as_of(measurement_date + 5L) - expect_error(ea_default$as_of(measurement_date + 6L)) + ea_default %>% epix_as_of(measurement_date + 5L) + expect_error(ea_default %>% epix_as_of(measurement_date + 6L)) }) diff --git a/tests/testthat/test-archive_new.R b/tests/testthat/test-archive_new.R deleted file mode 100644 index 98f708d7..00000000 --- a/tests/testthat/test-archive_new.R +++ /dev/null @@ -1,173 +0,0 @@ -library(dplyr) - -test_that("first input must be a data.frame", { - expect_error(as_epi_archive2(c(1, 2, 3), compactify = FALSE), - regexp = "Must be of type 'data.frame'." - ) -}) - -dt <- archive_cases_dv_subset_2$DT - -test_that("data.frame must contain geo_value, time_value and version columns", { - expect_error(as_epi_archive2(select(dt, -geo_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - expect_error(as_epi_archive2(select(dt, -time_value), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - expect_error(as_epi_archive2(select(dt, -version), compactify = FALSE), - regexp = "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) -}) - -test_that("other_keys can only contain names of the data.frame columns", { - expect_error(as_epi_archive2(dt, other_keys = "xyz", compactify = FALSE), - regexp = "`other_keys` must be contained in the column names of `x`." - ) - expect_error(as_epi_archive2(dt, other_keys = "percent_cli", compactify = FALSE), NA) -}) - -test_that("other_keys cannot contain names geo_value, time_value or version", { - expect_error(as_epi_archive2(dt, other_keys = "geo_value", compactify = FALSE), - regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." - ) - expect_error(as_epi_archive2(dt, other_keys = "time_value", compactify = FALSE), - regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." - ) - expect_error(as_epi_archive2(dt, other_keys = "version", compactify = FALSE), - regexp = "`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\"." - ) -}) - -test_that("Warning thrown when other_metadata contains overlapping names with geo_type or time_type fields", { - expect_warning(as_epi_archive2(dt, additional_metadata = list(geo_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." - ) - expect_warning(as_epi_archive2(dt, additional_metadata = list(time_type = 1), compactify = FALSE), - regexp = "`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\"." - ) -}) - -test_that("epi_archives are correctly instantiated with a variety of data types", { - # Data frame - df <- data.frame( - geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20 - ) - - ea1 <- as_epi_archive2(df, compactify = FALSE) - expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) - expect_equal(ea1$additional_metadata, list()) - - ea2 <- as_epi_archive2(df, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) - expect_equal(ea2$additional_metadata, list(value = df$value)) - - # Tibble - tib <- tibble::tibble(df, code = "x") - - ea3 <- as_epi_archive2(tib, compactify = FALSE) - expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) - expect_equal(ea3$additional_metadata, list()) - - ea4 <- as_epi_archive2(tib, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea4$additional_metadata, list(value = df$value)) - - # Keyed data.table - kdt <- data.table::data.table( - geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20, - code = "CA", - key = "code" - ) - - ea5 <- as_epi_archive2(kdt, compactify = FALSE) - # Key from data.table isn't absorbed when as_epi_archive2 is used - expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) - expect_equal(ea5$additional_metadata, list()) - - ea6 <- as_epi_archive2(kdt, other_keys = "value", additional_metadata = list(value = df$value), compactify = FALSE) - # Mismatched keys, but the one from as_epi_archive2 overrides - expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) - expect_equal(ea6$additional_metadata, list(value = df$value)) - - # Unkeyed data.table - udt <- data.table::data.table( - geo_value = "ca", - time_value = as.Date("2020-01-01"), - version = as.Date("2020-01-01") + 0:19, - value = 1:20, - code = "CA" - ) - - ea7 <- as_epi_archive2(udt, compactify = FALSE) - expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) - expect_equal(ea7$additional_metadata, list()) - - ea8 <- as_epi_archive2(udt, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea8$additional_metadata, list(value = df$value)) - - # epi_df - edf1 <- jhu_csse_daily_subset %>% - select(geo_value, time_value, cases) %>% - mutate(version = max(time_value), code = "USA") - - ea9 <- as_epi_archive2(edf1, compactify = FALSE) - expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) - expect_equal(ea9$additional_metadata, list()) - - ea10 <- as_epi_archive2(edf1, other_keys = "code", additional_metadata = list(value = df$value), compactify = FALSE) - expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) - expect_equal(ea10$additional_metadata, list(value = df$value)) - - # Keyed epi_df - edf2 <- data.frame( - geo_value = "al", - time_value = rep(as.Date("2020-01-01") + 0:9, 2), - version = c( - rep(as.Date("2020-01-25"), 10), - rep(as.Date("2020-01-26"), 10) - ), - cases = 1:20, - misc = "USA" - ) %>% - as_epi_df(additional_metadata = list(other_keys = "misc")) - - ea11 <- as_epi_archive2(edf2, compactify = FALSE) - expect_equal(key(ea11$DT), c("geo_value", "time_value", "version")) - expect_equal(ea11$additional_metadata, list()) - - ea12 <- as_epi_archive2(edf2, other_keys = "misc", additional_metadata = list(value = df$misc), compactify = FALSE) - expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) - expect_equal(ea12$additional_metadata, list(value = df$misc)) -}) - -test_that("`epi_archive` rejects nonunique keys", { - toy_update_tbl <- - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130, - "us", "pediatric", "2000-01-01", "2000-01-02", 5 - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version) - ) - expect_error( - as_epi_archive2(toy_update_tbl), - class = "epiprocess__epi_archive_requires_unique_key" - ) - expect_error( - regexp = NA, - as_epi_archive2(toy_update_tbl, other_keys = "age_group"), - ) -}) diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 58e97884..263d67b7 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -2,7 +2,7 @@ library(epiprocess) library(data.table) library(dplyr) -dt <- archive_cases_dv_subset_2$DT +dt <- archive_cases_dv_subset$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% select(-case_rate_7d_av) @@ -84,8 +84,8 @@ test_that("as_of produces the same results with compactify=TRUE as with compacti # Row 22, an LOCF row corresponding to the latest version, is omitted in # ea_true latest_version <- max(ea_false$DT$version) - as_of_true <- ea_true$as_of(latest_version) - as_of_false <- ea_false$as_of(latest_version) + as_of_true <- epix_as_of(ea_true, latest_version) + as_of_false <- epix_as_of(ea_false, latest_version) expect_identical(as_of_true, as_of_false) }) diff --git a/tests/testthat/test-compactify_new.R b/tests/testthat/test-compactify_new.R deleted file mode 100644 index cd53913d..00000000 --- a/tests/testthat/test-compactify_new.R +++ /dev/null @@ -1,110 +0,0 @@ -library(epiprocess) -library(data.table) -library(dplyr) - -dt <- archive_cases_dv_subset_2$DT -dt <- filter(dt, geo_value == "ca") %>% - filter(version <= "2020-06-15") %>% - select(-case_rate_7d_av) - -test_that("Input for compactify must be NULL or a boolean", { - expect_error(as_epi_archive2(dt, compactify = "no")) -}) - -dt$percent_cli <- c(1:80) -dt$case_rate <- c(1:80) - -row_replace <- function(dt, row, x, y) { - # (This way of "replacing" elements appears to use copy-on-write even though - # we are working with a data.table.) - dt[row, 4] <- x - dt[row, 5] <- y - dt -} - -# Note that compactify is working on version-wise LOCF (last version of each -# observation carried forward) - -# Rows 1 should not be eliminated even if NA -dt <- row_replace(dt, 1, NA, NA) # Not LOCF - -# NOTE! We are assuming that there are no NA's in geo_value, time_value, -# and version. Even though compactify may erroneously remove the first row -# if it has all NA's, we are not testing this behaviour for now as this dataset -# has problems beyond the scope of this test - -# Rows 11 and 12 correspond to different time_values -dt <- row_replace(dt, 12, 11, 11) # Not LOCF - -# Rows 20 and 21 only differ in version -dt <- row_replace(dt, 21, 20, 20) # LOCF - -# Rows 21 and 22 only differ in version -dt <- row_replace(dt, 22, 20, 20) # LOCF - -# Row 39 comprises the first NA's -dt <- row_replace(dt, 39, NA, NA) # Not LOCF - -# Row 40 has two NA's, just like its lag, row 39 -dt <- row_replace(dt, 40, NA, NA) # LOCF - -# Row 62's values already exist in row 15, but row 15 is not a preceding row -dt <- row_replace(dt, 62, 15, 15) # Not LOCF - -# Row 73 only has one value carried over -dt <- row_replace(dt, 74, 73, 74) # Not LOCF - -dt_true <- as_tibble(as_epi_archive2(dt, compactify = TRUE)$DT) -dt_false <- as_tibble(as_epi_archive2(dt, compactify = FALSE)$DT) -dt_null <- suppressWarnings(as_tibble(as_epi_archive2(dt, compactify = NULL)$DT)) - -test_that("Warning for LOCF with compactify as NULL", { - expect_warning(as_epi_archive2(dt, compactify = NULL)) -}) - -test_that("No warning when there is no LOCF", { - expect_warning(as_epi_archive2(dt[1:5], compactify = NULL), NA) -}) - -test_that("LOCF values are ignored with compactify=FALSE", { - expect_identical(nrow(dt), nrow(dt_false)) -}) - -test_that("LOCF values are taken out with compactify=TRUE", { - dt_test <- as_tibble(as_epi_archive2(dt[-c(21, 22, 40), ], compactify = FALSE)$DT) - - expect_identical(dt_true, dt_null) - expect_identical(dt_null, dt_test) -}) - -test_that("as_of produces the same results with compactify=TRUE as with compactify=FALSE", { - ea_true <- as_epi_archive2(dt, compactify = TRUE) - ea_false <- as_epi_archive2(dt, compactify = FALSE) - - # Row 22, an LOCF row corresponding to the latest version, is omitted in - # ea_true - latest_version <- max(ea_false$DT$version) - as_of_true <- as_of(ea_true, latest_version) - as_of_false <- as_of(ea_false, latest_version) - - expect_identical(as_of_true, as_of_false) -}) - -test_that("compactify does not alter the default clobberable and observed version bounds", { - x <- tibble::tibble( - geo_value = "geo1", - time_value = as.Date("2000-01-01"), - version = as.Date("2000-01-01") + 1:5, - value = 42L - ) - ea_true <- as_epi_archive2(x, compactify = TRUE) - ea_false <- as_epi_archive2(x, compactify = FALSE) - # We say that we base the bounds on the user's `x` arg. We might mess up or - # change our minds and base things on the `DT` field (or a temporary `DT` - # variable, post-compactify) instead. Check that this test would trigger - # in that case: - expect_true(max(ea_true$DT$version) != max(ea_false$DT$version)) - # The actual test: - expect_identical(ea_true$clobberable_versions_start, ea_false$clobberable_versions_start) - expect_identical(ea_true$versions_end, ea_false$versions_end) -}) diff --git a/tests/testthat/test-deprecations.R b/tests/testthat/test-deprecations.R index 5be3824e..7d29149b 100644 --- a/tests/testthat/test-deprecations.R +++ b/tests/testthat/test-deprecations.R @@ -5,8 +5,8 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - slide(function(...) {}, before = 2L, group_by = c()), + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( @@ -16,9 +16,9 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - group_by(geo_value)$ - slide(function(...) {}, before = 2L, group_by = c()), + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before = 2L, group_by = c()), class = "epiprocess__epix_slide_group_by_parameter_deprecated" ) # @@ -28,8 +28,8 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - slide(function(...) {}, before = 2L, all_rows = TRUE), + archive_cases_dv_subset %>% + epix_slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( @@ -39,9 +39,9 @@ test_that("epix_slide group_by= deprecation works", { class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) expect_error( - archive_cases_dv_subset$ - group_by(geo_value)$ - slide(function(...) {}, before = 2L, all_rows = TRUE), + archive_cases_dv_subset %>% + group_by(geo_value) %>% + epix_slide(function(...) {}, before = 2L, all_rows = TRUE), class = "epiprocess__epix_slide_all_rows_parameter_deprecated" ) }) diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 9ba847fa..89bb4804 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -7,15 +7,7 @@ test_that("epix_fill_through_version mirrors input when it is sufficiently up to ea_trivial_fill_na1 <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "na") ea_trivial_fill_na2 <- epix_fill_through_version(ea_orig, ea_orig$versions_end, "na") ea_trivial_fill_locf <- epix_fill_through_version(ea_orig, some_earlier_observed_version, "locf") - # Below, we want R6 objects to be compared based on contents rather than - # addresses. We appear to get this with `expect_identical` in `testthat` - # edition 3, which is based on `waldo::compare` rather than `base::identical`; - # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 - # objects by contents rather than address (in a way that is tested but maybe - # not guaranteed via user docs). Use `testthat::local_edition` to ensure we - # use testthat edition 3 here (use `testthat::` to prevent ambiguity with - # `readr`). - testthat::local_edition(3) + expect_identical(ea_orig, ea_trivial_fill_na1) expect_identical(ea_orig, ea_trivial_fill_na2) expect_identical(ea_orig, ea_trivial_fill_locf) @@ -33,20 +25,17 @@ test_that("epix_fill_through_version can extend observed versions, gives expecte ea_fill_na <- epix_fill_through_version(ea_orig, later_unobserved_version, "na") ea_fill_locf <- epix_fill_through_version(ea_orig, later_unobserved_version, "locf") - # We use testthat edition 3 features here, passing `ignore_attr` to - # `waldo::compare`. Ensure we are using edition 3: - testthat::local_edition(3) withCallingHandlers( { expect_identical(ea_fill_na$versions_end, later_unobserved_version) - expect_identical(tibble::as_tibble(ea_fill_na$as_of(first_unobserved_version)), + expect_identical(tibble::as_tibble(epix_as_of(ea_fill_na, first_unobserved_version)), tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), ignore_attr = TRUE ) expect_identical(ea_fill_locf$versions_end, later_unobserved_version) expect_identical( - ea_fill_locf$as_of(first_unobserved_version), - ea_fill_locf$as_of(ea_orig$versions_end) %>% + epix_as_of(ea_fill_locf, first_unobserved_version), + epix_as_of(ea_fill_locf, ea_orig$versions_end) %>% { attr(., "metadata")$as_of <- first_unobserved_version . @@ -69,54 +58,31 @@ test_that("epix_fill_through_version does not mutate x", { # doesn't seem sufficient to trigger) as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) )) { - # We want to perform a strict comparison of the contents of `ea_orig` before - # and `ea_orig` after. `clone` + `expect_identical` based on waldo would - # sort of work, but we might want something stricter. `as.list` + - # `identical` plus a check of the DT seems to do the trick. - ea_orig_before_as_list <- as.list(ea_orig) + ea_orig_before <- clone(ea_orig) ea_orig_dt_before_copy <- data.table::copy(ea_orig$DT) some_unobserved_version <- 8L - # + ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list <- as.list(ea_orig) - # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_before, ea_orig) expect_identical(ea_orig_dt_before_copy, ea_orig$DT) - # + ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") - ea_orig_after_as_list <- as.list(ea_orig) - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) + expect_identical(ea_orig_before, ea_orig) expect_identical(ea_orig_dt_before_copy, ea_orig$DT) } }) -test_that("x$fill_through_version mutates x (if needed)", { - ea <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )) - # We want the contents to change in a substantial way that makes waldo compare - # different (if the contents need to change). - ea_before_copies_as_list <- lapply(ea, data.table::copy) - some_unobserved_version <- 8L - ea$fill_through_version(some_unobserved_version, "na") - ea_after_copies_as_list <- lapply(ea, data.table::copy) - expect_failure(expect_identical(ea_before_copies_as_list, ea_after_copies_as_list)) -}) - -test_that("{epix_,$}fill_through_version return with expected visibility", { +test_that("epix_fill_through_version return with expected visibility", { ea <- as_epi_archive(data.table::data.table( geo_value = "g1", time_value = as.Date("2020-01-01"), version = 1:5, value = 1:5 )) expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) - expect_false(withVisible(ea$fill_through_version(15L, "na"))[["visible"]]) }) test_that("epix_fill_through_version returns same key & doesn't mutate old DT or its key", { ea <- as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - old_dt <- ea$DT - old_dt_copy <- data.table::copy(old_dt) + old_dt_copy <- data.table::copy(ea$DT) old_key <- data.table::key(ea$DT) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "na")$DT), old_key) expect_identical(data.table::key(epix_fill_through_version(ea, 5L, "locf")$DT), old_key) diff --git a/tests/testthat/test-epix_fill_through_version_new.R b/tests/testthat/test-epix_fill_through_version_new.R deleted file mode 100644 index 2b76a851..00000000 --- a/tests/testthat/test-epix_fill_through_version_new.R +++ /dev/null @@ -1,109 +0,0 @@ -test_that("epix_fill_through_version2 mirrors input when it is sufficiently up to date", { - ea_orig <- as_epi_archive2(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )) - some_earlier_observed_version <- 2L - ea_trivial_fill_na1 <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "na") - ea_trivial_fill_na2 <- epix_fill_through_version2(ea_orig, ea_orig$versions_end, "na") - ea_trivial_fill_locf <- epix_fill_through_version2(ea_orig, some_earlier_observed_version, "locf") - # Below, we want R6 objects to be compared based on contents rather than - # addresses. We appear to get this with `expect_identical` in `testthat` - # edition 3, which is based on `waldo::compare` rather than `base::identical`; - # `waldo::compare` in waldo >=0.3.1 appears (as of 0.4.0) to compare R6 - # objects by contents rather than address (in a way that is tested but maybe - # not guaranteed via user docs). Use `testthat::local_edition` to ensure we - # use testthat edition 3 here (use `testthat::` to prevent ambiguity with - # `readr`). - testthat::local_edition(3) - expect_identical(ea_orig, ea_trivial_fill_na1) - expect_identical(ea_orig, ea_trivial_fill_na2) - expect_identical(ea_orig, ea_trivial_fill_locf) -}) - -test_that("epix_fill_through_version2 can extend observed versions, gives expected `as_of`s", { - ea_orig <- as_epi_archive2(data.table::data.table( - geo_value = "g1", - time_value = as.Date("2020-01-01") + c(rep(0L, 5L), 1L), - version = c(1:5, 2L), - value = 1:6 - )) - first_unobserved_version <- 6L - later_unobserved_version <- 10L - ea_fill_na <- epix_fill_through_version2(ea_orig, later_unobserved_version, "na") - ea_fill_locf <- epix_fill_through_version2(ea_orig, later_unobserved_version, "locf") - - # We use testthat edition 3 features here, passing `ignore_attr` to - # `waldo::compare`. Ensure we are using edition 3: - testthat::local_edition(3) - withCallingHandlers( - { - expect_identical(ea_fill_na$versions_end, later_unobserved_version) - expect_identical(tibble::as_tibble(as_of(ea_fill_na, first_unobserved_version)), - tibble::tibble(geo_value = "g1", time_value = as.Date("2020-01-01") + 0:1, value = rep(NA_integer_, 2L)), - ignore_attr = TRUE - ) - expect_identical(ea_fill_locf$versions_end, later_unobserved_version) - expect_identical( - as_of(ea_fill_locf, first_unobserved_version), - as_of(ea_fill_locf, ea_orig$versions_end) %>% - { - attr(., "metadata")$as_of <- first_unobserved_version - . - } - ) - }, - epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") - ) -}) - -test_that("epix_fill_through_version2 does not mutate x", { - for (ea_orig in list( - # vanilla case - as_epi_archive2(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )), - # data.table unique yielding original DT by reference special case (maybe - # having only 1 row is the trigger? having no revisions of initial values - # doesn't seem sufficient to trigger) - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - )) { - # We want to perform a strict comparison of the contents of `ea_orig` before - # and `ea_orig` after. `clone` + `expect_identical` based on waldo would - # sort of work, but we might want something stricter. `as.list` + - # `identical` plus a check of the DT seems to do the trick. - ea_orig_before_as_list <- as.list(ea_orig) - ea_orig_DT_before_copy <- data.table::copy(ea_orig$DT) - some_unobserved_version <- 8L - # - ea_fill_na <- epix_fill_through_version2(ea_orig, some_unobserved_version, "na") - ea_orig_after_as_list <- as.list(ea_orig) - # use identical, not expect_identical, for the R6-as-list test; latter isn't as strict - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) - expect_identical(ea_orig_DT_before_copy, ea_orig$DT) - # - ea_fill_locf <- epix_fill_through_version2(ea_orig, some_unobserved_version, "locf") - ea_orig_after_as_list <- as.list(ea_orig) - expect_true(identical(ea_orig_before_as_list, ea_orig_after_as_list)) - expect_identical(ea_orig_DT_before_copy, ea_orig$DT) - } -}) - -test_that("epix_fill_through_version return with expected visibility", { - ea <- as_epi_archive(data.table::data.table( - geo_value = "g1", time_value = as.Date("2020-01-01"), - version = 1:5, value = 1:5 - )) - expect_true(withVisible(epix_fill_through_version(ea, 10L, "na"))[["visible"]]) -}) - -test_that("epix_fill_through_version2 returns same key & doesn't mutate old DT or its key", { - ea <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) - old_DT <- ea$DT - old_DT_copy <- data.table::copy(old_DT) - old_key <- data.table::key(ea$DT) - expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "na")$DT), old_key) - expect_identical(data.table::key(epix_fill_through_version2(ea, 5L, "locf")$DT), old_key) - expect_identical(data.table::key(ea$DT), old_key) -}) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 181aee28..9bcc7d67 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,5 +1,6 @@ test_that("epix_merge requires forbids on invalid `y`", { - ea <- archive_cases_dv_subset$clone() + ea <- archive_cases_dv_subset %>% + clone() expect_error(epix_merge(ea, data.frame(x = 1))) }) @@ -58,9 +59,7 @@ test_that("epix_merge merges and carries forward updates properly", { dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) ) ) - # We rely on testthat edition 3 expect_identical using waldo, not identical. See - # test-epix_fill_through_version.R comments for details. - testthat::local_edition(3) + expect_identical(xy, xy_expected) }) diff --git a/tests/testthat/test-epix_merge_new.R b/tests/testthat/test-epix_merge_new.R deleted file mode 100644 index 10041dbb..00000000 --- a/tests/testthat/test-epix_merge_new.R +++ /dev/null @@ -1,226 +0,0 @@ -test_that("epix_merge requires forbids on invalid `y`", { - ea <- archive_cases_dv_subset_2 %>% - clone() - expect_error(epix_merge2(ea, data.frame(x = 1))) -}) - -test_that("epix_merge merges and carries forward updates properly", { - x <- as_epi_archive2( - data.table::as.data.table( - tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, - # same version set for x and y - "g1", 1L, 1:3, paste0("XA", 1:3), - # versions of x surround those of y + this measurement has - # max update version beyond some others - "g1", 2L, 1:5, paste0("XB", 1:5), - # mirror case - "g1", 3L, 2L, paste0("XC", 2L), - # x has 1 version, y has 0 - "g1", 4L, 1L, paste0("XD", 1L), - # non-NA values that should be carried forward - # (version-wise LOCF) in other versions, plus NAs that - # should (similarly) be carried forward as NA (latter - # wouldn't work with an ordinary merge + post-processing - # with `data.table::nafill`) - "g1", 6L, c(1L, 3L, 5L), paste0("XE", c(1L, NA, 5L)) - ) %>% - tidyr::unchop(c(version, x_value)) %>% - dplyr::mutate(dplyr::across(c(x_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) - ) - ) - y <- as_epi_archive2( - data.table::as.data.table( - tibble::tribble( - ~geo_value, ~time_value, ~version, ~y_value, - "g1", 1L, 1:3, paste0("YA", 1:3), - "g1", 2L, 2L, paste0("YB", 2L), - "g1", 3L, 1:5, paste0("YC", 1:5), - "g1", 5L, 1L, paste0("YD", 1L), - "g1", 6L, 1:5, paste0("YE", 1:5), - ) %>% - tidyr::unchop(c(version, y_value)) %>% - dplyr::mutate(dplyr::across(c(y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) - ) - ) - xy <- epix_merge2(x, y) - xy_expected <- as_epi_archive2( - data.table::as.data.table( - tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - "g1", 1L, 1:3, paste0("XA", 1:3), paste0("YA", 1:3), - "g1", 2L, 1:5, paste0("XB", 1:5), paste0("YB", c(NA, 2L, 2L, 2L, 2L)), - "g1", 3L, 1:5, paste0("XC", c(NA, 2L, 2L, 2L, 2L)), paste0("YC", 1:5), - "g1", 4L, 1L, paste0("XD", 1L), paste0("YD", NA), - "g1", 5L, 1L, paste0("XD", NA), paste0("YD", 1L), - "g1", 6L, 1:5, paste0("XE", c(1L, 1L, NA, NA, 5L)), paste0("YE", 1:5), - ) %>% - tidyr::unchop(c(version, x_value, y_value)) %>% - dplyr::mutate(dplyr::across(c(x_value, y_value), ~ dplyr::if_else(grepl("NA", .x), NA_character_, .x))) - ) - ) - # We rely on testthat edition 3 expect_identical using waldo, not identical. See - # test-epix_fill_through_version.R comments for details. - testthat::local_edition(3) - expect_identical(xy, xy_expected) -}) - -test_that("epix_merge forbids and warns on metadata and naming issues", { - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = "tx", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = "us", time_value = 1L, version = 5L, y_value = 2L)) - ), - regexp = "must have the same.*geo_type" - ) - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = "pa", time_value = as.Date("2020-01-01"), version = 5L, y_value = 2L)) - ), - regexp = "must have the same.*time_type" - ) - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 2L)) - ), - regexp = "overlapping.*names" - ) - expect_warning( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), - additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) - ), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L)) - ), - regexp = "x\\$additional_metadata", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) - expect_warning( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L)), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), - additional_metadata = list("updates_fetched" = lubridate::ymd_hms("2022-05-01 16:00:00", tz = "UTC")) - ) - ), - regexp = "y\\$additional_metadata", - class = "epiprocess__epix_merge_ignores_additional_metadata" - ) -}) - -# use `local` to prevent accidentally using the x, y, xy bindings here -# elsewhere, while allowing reuse across a couple tests -local({ - x <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 1L), - clobberable_versions_start = 1L, versions_end = 10L - ) - y <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 2L), - clobberable_versions_start = 3L, versions_end = 10L - ) - xy <- epix_merge2(x, y) - test_that("epix_merge considers partially-clobberable row to be clobberable", { - expect_identical(xy$clobberable_versions_start, 1L) - }) - test_that("epix_merge result uses versions_end metadata not max version val", { - expect_identical(xy$versions_end, 10L) - }) -}) - -local({ - x <- as_epi_archive2( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L), - clobberable_versions_start = 1L, - versions_end = 3L - ) - y <- as_epi_archive2( - tibble::tibble(geo_value = 1L, time_value = 1L, version = 5L, y_value = 20L), - clobberable_versions_start = 1L - ) - test_that('epix_merge forbids on sync default or "forbid"', { - expect_error(epix_merge2(x, y), - class = "epiprocess__epix_merge_unresolved_sync" - ) - expect_error(epix_merge2(x, y, sync = "forbid"), - class = "epiprocess__epix_merge_unresolved_sync" - ) - }) - test_that('epix_merge sync="na" works', { - expect_equal( - epix_merge2(x, y, sync = "na"), - as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet - 1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated - # (we should not have a y vals -> NA update here; version 5 should be - # the `versions_end` of the result) - ), clobberable_versions_start = 1L) - ) - }) - test_that('epix_merge sync="locf" works', { - expect_equal( - epix_merge2(x, y, sync = "locf"), - as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - 1L, 1L, 5L, 10L, 20L, # x LOCF'd, y updated - ), clobberable_versions_start = 1L) - ) - }) - test_that('epix_merge sync="truncate" works', { - expect_equal( - epix_merge2(x, y, sync = "truncate"), - as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet - # y's update beyond x's last update has been truncated - ), clobberable_versions_start = 1L, versions_end = 3L) - ) - }) - x_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, x_value = 10L)) - y_no_conflict <- as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, y_value = 20L)) - xy_no_conflict_expected <- as_epi_archive2(tibble::tribble( - ~geo_value, ~time_value, ~version, ~x_value, ~y_value, - 1L, 1L, 1L, 10L, 20L, # x updated, y not observed yet - )) - test_that('epix_merge sync="forbid" on no-conflict works', { - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "forbid"), - xy_no_conflict_expected - ) - }) - test_that('epix_merge sync="na" on no-conflict works', { - # This test is the main reason for these no-conflict tests. We want to make - # sure that we don't add an unnecessary NA-ing-out version beyond a common - # versions_end. - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "na"), - xy_no_conflict_expected - ) - }) - test_that('epix_merge sync="locf" on no-conflict works', { - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "locf"), - xy_no_conflict_expected - ) - }) - test_that('epix_merge sync="truncate" on no-conflict works', { - expect_equal( - epix_merge2(x_no_conflict, y_no_conflict, sync = "truncate"), - xy_no_conflict_expected - ) - }) -}) - - -test_that('epix_merge sync="na" balks if do not know next_after', { - expect_error( - epix_merge2( - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-01")), x_value = 10L)), - as_epi_archive2(tibble::tibble(geo_value = 1L, time_value = 1L, version = as.POSIXct(as.Date("2020-01-02")), y_value = 20L)), - sync = "na" - ), - regexp = "no applicable method.*next_after" - ) -}) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 07f0e5bf..b7a3e946 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -39,15 +39,13 @@ test_that("epix_slide works as intended", { expect_identical(xx1, xx2) # * - xx3 <- ( - xx - $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide( + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + epix_slide( f = ~ sum(.x$binary), before = 2, new_col_name = "sum_binary" ) - ) expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical @@ -95,15 +93,13 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { expect_identical(xx_dfrow1, xx_dfrow2) # * - xx_dfrow3 <- ( - xx - $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide( + xx_dfrow3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + epix_slide( f = ~ data.frame(bin_sum = sum(.x$binary)), before = 2, as_list_col = TRUE ) - ) expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical @@ -179,39 +175,40 @@ test_that("epix_slide works as intended with `as_list_col=TRUE`", { test_that("epix_slide `before` validation works", { expect_error( - xx$slide(f = ~ sum(.x$binary)), + xx %>% epix_slide(f = ~ sum(.x$binary)), "`before` is required" ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = NA), + xx %>% epix_slide(f = ~ sum(.x$binary), before = NA), "Assertion on 'before' failed: May not be NA" ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = -1), + xx %>% epix_slide(f = ~ sum(.x$binary), before = -1), "Assertion on 'before' failed: Element 1 is not >= 0" ) - expect_error(xx$slide(f = ~ sum(.x$binary), before = 1.5), + expect_error( + xx %>% epix_slide(f = ~ sum(.x$binary), before = 1.5), regexp = "before", class = "vctrs_error_incompatible_type" ) # We might want to allow this at some point (issue #219): - expect_error(xx$slide(f = ~ sum(.x$binary), before = Inf), + expect_error( + xx %>% epix_slide(f = ~ sum(.x$binary), before = Inf), regexp = "before", class = "vctrs_error_incompatible_type" ) - # (wrapper shouldn't introduce a value:) - expect_error(epix_slide(xx, f = ~ sum(.x$binary)), "`before` is required") + expect_error(xx %>% epix_slide(f = ~ sum(.x$binary)), "`before` is required") # These `before` values should be accepted: expect_error( - xx$slide(f = ~ sum(.x$binary), before = 0), + xx %>% epix_slide(f = ~ sum(.x$binary), before = 0), NA ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = 2L), + xx %>% epix_slide(f = ~ sum(.x$binary), before = 2L), NA ) expect_error( - xx$slide(f = ~ sum(.x$binary), before = 365000), + xx %>% epix_slide(f = ~ sum(.x$binary), before = 365000), NA ) }) @@ -251,12 +248,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss # # (S3 group_by behavior for this case is the `reference_by_modulus`) expect_identical( - ea$group_by(modulus)$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(modulus) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the .data pronoun behavior: @@ -271,12 +270,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus ) expect_identical( - ea$group_by(.data$modulus)$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(.data$modulus) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the passing across-all-of-string-literal behavior: @@ -291,12 +292,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus ) expect_identical( - ea$group_by(across(all_of("modulus")))$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(across(all_of("modulus"))) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the passing-across-all-of-string-var behavior: @@ -312,12 +315,14 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_modulus ) expect_identical( - ea$group_by(dplyr::across(tidyselect::all_of(my_group_by)))$slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), + ea %>% + group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% + epix_slide( + f = ~ mean(.x$case_rate_7d_av), + before = 2, + ref_time_values = time_values, + new_col_name = "case_rate_3d_av" + ), reference_by_modulus ) # test the default behavior (default in this case should just be grouping by neither): @@ -332,7 +337,7 @@ test_that("quosure passing issue in epix_slide is resolved + other potential iss reference_by_neither ) expect_identical( - ea$slide( + ea %>% epix_slide( f = ~ mean(.x$case_rate_7d_av), before = 2, ref_time_values = time_values, @@ -356,12 +361,6 @@ ea <- tibble::tribble( as_epi_archive() test_that("epix_slide with all_versions option has access to all older versions", { - library(data.table) - # Make sure we're using testthat edition 3, where `expect_identical` doesn't - # actually mean `base::identical` but something more content-based using - # `waldo` package: - testthat::local_edition(3) - slide_fn <- function(x, gk, rtv) { return(tibble( n_versions = length(unique(x$DT$version)), @@ -371,8 +370,8 @@ test_that("epix_slide with all_versions option has access to all older versions" )) } - ea_orig_mirror <- ea$clone(deep = TRUE) - ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) + ea_orig_mirror <- ea %>% clone() + ea_orig_mirror$DT <- data.table::copy(ea_orig_mirror$DT) result1 <- ea %>% group_by() %>% @@ -397,16 +396,14 @@ test_that("epix_slide with all_versions option has access to all older versions" expect_identical(result1, result2) # * - result3 <- ( - ea - $group_by() - $slide( + result3 <- ea %>% + group_by() %>% + epix_slide( f = slide_fn, before = 10^3, names_sep = NULL, all_versions = TRUE ) - ) expect_identical(result1, result3) # This and * Imply result2 and result3 are identical @@ -427,7 +424,7 @@ test_that("epix_slide with all_versions option has access to all older versions" group_by() %>% epix_slide( data = slide_fn( - .data$clone(), # hack to convert from pronoun back to archive + .x, stop("slide_fn doesn't use group key, no need to prepare it") ), before = 10^3, @@ -436,14 +433,10 @@ test_that("epix_slide with all_versions option has access to all older versions" ) expect_identical(result1, result5) # This and * Imply result2 and result5 are identical - expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea }) -test_that("as_of and epix_slide with long enough window are compatible", { - library(data.table) - testthat::local_edition(3) - +test_that("epix_as_of and epix_slide with long enough window are compatible", { # For all_versions = FALSE: f1 <- function(x, gk, rtv) { @@ -454,8 +447,8 @@ test_that("as_of and epix_slide with long enough window are compatible", { ref_time_value1 <- 5 expect_identical( - ea$as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), - ea$slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) + ea %>% epix_as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), + ea %>% epix_slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) ) # For all_versions = TRUE: @@ -475,18 +468,24 @@ test_that("as_of and epix_slide with long enough window are compatible", { ) %>% # assess as nowcast: unnest(data) %>% - inner_join(x$as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% + inner_join( + x %>% epix_as_of(x$versions_end), + by = setdiff(key(x$DT), c("version")) + ) %>% summarize(mean_abs_delta = mean(abs(binary - lag1))) } ref_time_value2 <- 5 expect_identical( - ea$as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), - ea$slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) + ea %>% + epix_as_of(ref_time_value2, all_versions = TRUE) %>% + f2() %>% + mutate(time_value = ref_time_value2, .before = 1L), + ea %>% epix_slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. - ea_multigeo <- ea$clone() + ea_multigeo <- ea %>% clone() ea_multigeo$DT <- rbind( ea_multigeo$DT, copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] @@ -547,16 +546,14 @@ test_that("epix_slide with all_versions option works as intended", { expect_identical(xx1, xx2) # * - xx3 <- ( - xx - $group_by(dplyr::across(dplyr::all_of("geo_value"))) - $slide( + xx3 <- xx %>% + group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% + epix_slide( f = ~ sum(.x$DT$binary), before = 2, new_col_name = "sum_binary", all_versions = TRUE ) - ) expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical }) @@ -568,7 +565,7 @@ test_that("epix_slide with all_versions option works as intended", { # back depending on the decisions there: # # test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { -# ea_updated_stale = ea$clone() +# ea_updated_stale = ea %>% clone() # ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) # # # expect_identical( @@ -811,7 +808,6 @@ test_that("`epix_slide` can access objects inside of helper functions", { helper(archive_cases_dv_subset, as.Date("2021-01-01")), NA ) - expect_error( helper(xx, 3L), NA diff --git a/tests/testthat/test-epix_slide_new.R b/tests/testthat/test-epix_slide_new.R deleted file mode 100644 index 49ef5e41..00000000 --- a/tests/testthat/test-epix_slide_new.R +++ /dev/null @@ -1,810 +0,0 @@ -library(dplyr) - -test_that("epix_slide2 only works on an epi_archive", { - expect_error(epix_slide2(data.frame(x = 1))) -}) - -x <- tibble::tribble( - ~version, ~time_value, ~binary, - 4, c(1:3), 2^(1:3), - 5, c(1:2, 4), 2^(4:6), - 6, c(1:2, 4:5), 2^(7:10), - 7, 2:6, 2^(11:15) -) %>% - tidyr::unnest(c(time_value, binary)) - -xx <- bind_cols(geo_value = rep("x", 15), x) %>% - as_epi_archive2() - -test_that("epix_slide2 works as intended", { - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary" - ) - - xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - sum_binary = c( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9, - 2^15 + 2^14 - ) - ) %>% - group_by(geo_value) - - expect_identical(xx1, xx2) # * - - xx3 <- xx %>% - group_by( - dplyr::across(dplyr::all_of("geo_value")) - ) %>% - slide( - f = ~ sum(.x$binary), - before = 2, - new_col_name = "sum_binary" - ) - - expect_identical(xx1, xx3) # This and * imply xx2 and xx3 are identical - - # function interface - xx4 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2(f = function(x, gk, rtv) { - tibble::tibble(sum_binary = sum(x$binary)) - }, before = 2, names_sep = NULL) - - expect_identical(xx1, xx4) - - # tidyeval interface - xx5 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - sum_binary = sum(binary), - before = 2 - ) - - expect_identical(xx1, xx5) -}) - -test_that("epix_slide2 works as intended with `as_list_col=TRUE`", { - xx_dfrow1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE - ) - - xx_dfrow2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - c( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9, - 2^15 + 2^14 - ) %>% - purrr::map(~ data.frame(bin_sum = .x)) - ) %>% - group_by(geo_value) - - expect_identical(xx_dfrow1, xx_dfrow2) # * - - xx_dfrow3 <- xx %>% - group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% - slide( - f = ~ data.frame(bin_sum = sum(.x$binary)), - before = 2, - as_list_col = TRUE - ) - - expect_identical(xx_dfrow1, xx_dfrow3) # This and * Imply xx_dfrow2 and xx_dfrow3 are identical - - xx_df1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ data.frame(bin = .x$binary), - before = 2, - as_list_col = TRUE - ) - - xx_df2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - list( - c(2^3, 2^2), - c(2^6, 2^3), - c(2^10, 2^9), - c(2^15, 2^14) - ) %>% - purrr::map(~ data.frame(bin = rev(.x))) - ) %>% - group_by(geo_value) - - expect_identical(xx_df1, xx_df2) - - xx_scalar1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ sum(.x$binary), - before = 2, - as_list_col = TRUE - ) - - xx_scalar2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - list( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9, - 2^15 + 2^14 - ) - ) %>% - group_by(geo_value) - - expect_identical(xx_scalar1, xx_scalar2) - - xx_vec1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ .x$binary, - before = 2, - as_list_col = TRUE - ) - - xx_vec2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = - list( - c(2^3, 2^2), - c(2^6, 2^3), - c(2^10, 2^9), - c(2^15, 2^14) - ) %>% - purrr::map(rev) - ) %>% - group_by(geo_value) - - expect_identical(xx_vec1, xx_vec2) -}) - -test_that("epix_slide2 `before` validation works", { - expect_error( - slide(xx, f = ~ sum(.x$binary)), - "`before` is required" - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = NA), - "Assertion on 'before' failed: May not be NA" - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = -1), - "Assertion on 'before' failed: Element 1 is not >= 0" - ) - expect_error(slide(xx, f = ~ sum(.x$binary), before = 1.5), - regexp = "before", - class = "vctrs_error_incompatible_type" - ) - # We might want to allow this at some point (issue #219): - expect_error(slide(xx, f = ~ sum(.x$binary), before = Inf), - regexp = "before", - class = "vctrs_error_incompatible_type" - ) - # (wrapper shouldn't introduce a value:) - expect_error(epix_slide2(xx, f = ~ sum(.x$binary)), "`before` is required") - # These `before` values should be accepted: - expect_error( - slide(xx, f = ~ sum(.x$binary), before = 0), - NA - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = 2L), - NA - ) - expect_error( - slide(xx, f = ~ sum(.x$binary), before = 365000), - NA - ) -}) - -test_that("quosure passing issue in epix_slide2 is resolved + other potential issues", { - # (First part adapted from @examples) - time_values <- seq(as.Date("2020-06-01"), - as.Date("2020-06-02"), - by = "1 day" - ) - # We only have one non-version, non-time key in the example archive. Add - # another so that we don't accidentally pass tests due to accidentally - # matching the default grouping. - ea <- as_epi_archive2( - archive_cases_dv_subset$DT %>% - dplyr::mutate(modulus = seq_len(nrow(.)) %% 5L), - other_keys = "modulus", - compactify = TRUE - ) - reference_by_modulus <- ea %>% - group_by(modulus) %>% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ) - reference_by_neither <- ea %>% - group_by() %>% - epix_slide2( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ) - # test the passing-something-that-must-be-enquosed behavior: - # - # (S3 group_by behavior for this case is the `reference_by_modulus`) - expect_identical( - ea %>% group_by(modulus) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the .data pronoun behavior: - expect_identical( - epix_slide2( - x = ea %>% group_by(.data$modulus), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - expect_identical( - ea %>% group_by(.data$modulus) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the passing across-all-of-string-literal behavior: - expect_identical( - epix_slide2( - x = ea %>% group_by(dplyr::across(all_of("modulus"))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - expect_identical( - ea %>% group_by(across(all_of("modulus"))) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the passing-across-all-of-string-var behavior: - my_group_by <- "modulus" - expect_identical( - epix_slide2( - x = ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))), - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - expect_identical( - ea %>% group_by(dplyr::across(tidyselect::all_of(my_group_by))) %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_modulus - ) - # test the default behavior (default in this case should just be grouping by neither): - expect_identical( - epix_slide2( - x = ea, - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_neither - ) - expect_identical( - ea %>% slide( - f = ~ mean(.x$case_rate_7d_av), - before = 2, - ref_time_values = time_values, - new_col_name = "case_rate_3d_av" - ), - reference_by_neither - ) -}) - -ea <- tibble::tribble( - ~version, ~time_value, ~binary, - 2, 1:1, 2^(1:1), - 3, 1:2, 2^(2:1), - 4, 1:3, 2^(3:1), - 5, 1:4, 2^(4:1), - 6, 1:5, 2^(5:1), - 7, 1:6, 2^(6:1) -) %>% - tidyr::unnest(c(time_value, binary)) %>% - mutate(geo_value = "x") %>% - as_epi_archive2() - -test_that("epix_slide2 with all_versions option has access to all older versions", { - library(data.table) - # Make sure we're using testthat edition 3, where `expect_identical` doesn't - # actually mean `base::identical` but something more content-based using - # `waldo` package: - testthat::local_edition(3) - - slide_fn <- function(x, gk, rtv) { - return(tibble( - n_versions = length(unique(x$DT$version)), - n_row = nrow(x$DT), - dt_class1 = class(x$DT)[[1L]], - dt_key = list(key(x$DT)) - )) - } - - ea_orig_mirror <- ea %>% clone(deep = TRUE) - ea_orig_mirror$DT <- copy(ea_orig_mirror$DT) - - result1 <- ea %>% - group_by() %>% - epix_slide2( - f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_true(inherits(result1, "tbl_df")) - - result2 <- tibble::tribble( - ~time_value, ~n_versions, ~n_row, ~dt_class1, ~dt_key, - 2, 1L, sum(1:1), "data.table", key(ea$DT), - 3, 2L, sum(1:2), "data.table", key(ea$DT), - 4, 3L, sum(1:3), "data.table", key(ea$DT), - 5, 4L, sum(1:4), "data.table", key(ea$DT), - 6, 5L, sum(1:5), "data.table", key(ea$DT), - 7, 6L, sum(1:6), "data.table", key(ea$DT), - ) - - expect_identical(result1, result2) # * - - result3 <- ea %>% - group_by() %>% - slide( - f = slide_fn, - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_identical(result1, result3) # This and * Imply result2 and result3 are identical - - # formula interface - result4 <- ea %>% - group_by() %>% - epix_slide2( - f = ~ slide_fn(.x, .y), - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_identical(result1, result4) # This and * Imply result2 and result4 are identical - - # tidyeval interface - result5 <- ea %>% - group_by() %>% - epix_slide2( - data = slide_fn( - .x, - stop("slide_fn doesn't use group key, no need to prepare it") - ), - before = 10^3, - names_sep = NULL, - all_versions = TRUE - ) - - expect_identical(result1, result5) # This and * Imply result2 and result5 are identical - expect_identical(ea, ea_orig_mirror) # We shouldn't have mutated ea -}) - -test_that("as_of and epix_slide2 with long enough window are compatible", { - library(data.table) - testthat::local_edition(3) - - # For all_versions = FALSE: - - f1 <- function(x, gk, rtv) { - tibble( - diff_mean = mean(diff(x$binary)) - ) - } - ref_time_value1 <- 5 - - expect_identical( - ea %>% as_of(ref_time_value1) %>% f1() %>% mutate(time_value = ref_time_value1, .before = 1L), - ea %>% slide(f1, before = 1000L, ref_time_values = ref_time_value1, names_sep = NULL) - ) - - # For all_versions = TRUE: - - f2 <- function(x, gk, rtv) { - x %>% - # extract time&version-lag-1 data: - epix_slide2( - function(subx, subgk, rtv) { - tibble(data = list( - subx %>% - filter(time_value == attr(subx, "metadata")$as_of - 1) %>% - rename(real_time_value = time_value, lag1 = binary) - )) - }, - before = 1, names_sep = NULL - ) %>% - # assess as nowcast: - unnest(data) %>% - inner_join(x %>% as_of(x$versions_end), by = setdiff(key(x$DT), c("version"))) %>% - summarize(mean_abs_delta = mean(abs(binary - lag1))) - } - ref_time_value2 <- 5 - - expect_identical( - ea %>% as_of(ref_time_value2, all_versions = TRUE) %>% f2() %>% mutate(time_value = ref_time_value2, .before = 1L), - ea %>% slide(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) - ) - - # Test the same sort of thing when grouping by geo in an archive with multiple geos. - ea_multigeo <- ea %>% clone() - ea_multigeo$DT <- rbind( - ea_multigeo$DT, - copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] - ) - setkeyv(ea_multigeo$DT, key(ea$DT)) - - expect_identical( - ea_multigeo %>% - group_by(geo_value) %>% - epix_slide2(f2, before = 1000L, ref_time_values = ref_time_value2, all_versions = TRUE, names_sep = NULL) %>% - filter(geo_value == "x"), - ea %>% # using `ea` here is like filtering `ea_multigeo` to `geo_value=="x"` - epix_as_of2(ref_time_value2, all_versions = TRUE) %>% - f2() %>% - transmute(geo_value = "x", time_value = ref_time_value2, mean_abs_delta) %>% - group_by(geo_value) - ) -}) - -test_that("epix_slide2 `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { - slide_fn <- function(x, gk, rtv) { - expect_true(is_epi_archive2(x)) - return(NA) - } - - ea %>% - group_by() %>% - epix_slide2( - f = slide_fn, - before = 1, - ref_time_values = 5, - new_col_name = "out", - all_versions = TRUE - ) -}) - -test_that("epix_slide2 with all_versions option works as intended", { - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE - ) - - xx2 <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - sum_binary = c( - 2^3 + 2^2, - 2^6 + 2^3, - 2^10 + 2^9 + 2^6, - 2^15 + 2^14 + 2^10 - ) - ) %>% - group_by(geo_value) - - expect_identical(xx1, xx2) # * - - xx3 <- xx %>% - group_by(dplyr::across(dplyr::all_of("geo_value"))) %>% - slide( - f = ~ sum(.x$DT$binary), - before = 2, - new_col_name = "sum_binary", - all_versions = TRUE - ) - - expect_identical(xx1, xx3) # This and * Imply xx2 and xx3 are identical -}) - -# XXX currently, we're using a stopgap measure of having `epix_slide2` always -# output a (grouped/ungrouped) tibble while we think about the class, columns, -# and attributes of `epix_slide2` output more carefully. We might bring this test -# back depending on the decisions there: -# -# test_that("`epix_slide2` uses `versions_end` as a resulting `epi_df`'s `as_of`", { -# ea_updated_stale = ea$clone() -# ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) -# # -# expect_identical( -# ea_updated_stale %>% -# group_by(geo_value) %>% -# epix_slide2(~ slice_head(.x, n = 1L), before = 10L) %>% -# ungroup() %>% -# attr("metadata") %>% -# .$as_of, -# 10 -# ) -# }) - -test_that("epix_slide2 works with 0-row computation outputs", { - epix_slide_empty <- function(ea, ...) { - ea %>% - epix_slide2(before = 5L, ..., function(x, gk, rtv) { - tibble::tibble() - }) - } - expect_identical( - ea %>% - epix_slide_empty(), - tibble::tibble( - time_value = ea$DT$version[integer(0)] - ) - ) - expect_identical( - ea %>% - group_by(geo_value) %>% - epix_slide_empty(), - tibble::tibble( - geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] - ) %>% - # new_epi_df(geo_type = ea$geo_type, time_type = ea$time_type, - # as_of = ea$versions_end) %>% - group_by(geo_value) - ) - # with `all_versions=TRUE`, we have something similar but never get an - # `epi_df`: - expect_identical( - ea %>% - epix_slide_empty(all_versions = TRUE), - tibble::tibble( - time_value = ea$DT$version[integer(0)] - ) - ) - expect_identical( - ea %>% - group_by(geo_value) %>% - epix_slide_empty(all_versions = TRUE), - tibble::tibble( - geo_value = ea$DT$geo_value[integer(0)], - time_value = ea$DT$version[integer(0)] - ) %>% - group_by(geo_value) - ) -}) - -# test_that("epix_slide grouped by geo can produce `epi_df` output", { -# # This is a characterization test. Not sure we actually want this behavior; -# # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 -# expect_identical( -# ea %>% -# group_by(geo_value) %>% -# epix_slide(before = 5L, function(x,g) { -# tibble::tibble(value = 42) -# }, names_sep = NULL), -# tibble::tibble( -# geo_value = "x", -# time_value = epix_slide_ref_time_values_default(ea), -# value = 42 -# ) %>% -# new_epi_df(as_of = ea$versions_end) -# ) -# }) - -test_that("epix_slide alerts if the provided f doesn't take enough args", { - f_xgt <- function(x, g, t) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - # If `regexp` is NA, asserts that there should be no errors/messages. - expect_error(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) - expect_warning(epix_slide2(xx, f = f_xgt, before = 2L), regexp = NA) - - f_x_dots <- function(x, ...) dplyr::tibble(value = mean(x$binary), count = length(x$binary)) - expect_warning(epix_slide2(xx, f_x_dots, before = 2L), - class = "epiprocess__assert_sufficient_f_args__mandatory_f_args_passed_to_f_dots" - ) -}) - -test_that("epix_slide2 computation via formula can use ref_time_value", { - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) - ) %>% - group_by(geo_value) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~.ref_time_value, - before = 2 - ) - - expect_identical(xx1, xx_ref) - - xx2 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~.z, - before = 2 - ) - - expect_identical(xx2, xx_ref) - - xx3 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = ~..3, - before = 2 - ) - - expect_identical(xx3, xx_ref) -}) - -test_that("epix_slide2 computation via function can use ref_time_value", { - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) - ) %>% - group_by(geo_value) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - f = function(x, g, t) t, - before = 2 - ) - - expect_identical(xx1, xx_ref) -}) - -test_that("epix_slide2 computation via dots can use ref_time_value and group", { - # ref_time_value - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = c(4, 5, 6, 7) - ) %>% - group_by(geo_value) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - slide_value = .ref_time_value - ) - - expect_identical(xx1, xx_ref) - - # group_key - xx_ref <- tibble( - geo_value = rep("x", 4), - time_value = c(4, 5, 6, 7), - slide_value = "x" - ) %>% - group_by(geo_value) - - # Use group_key column - xx3 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - slide_value = .group_key$geo_value - ) - - expect_identical(xx3, xx_ref) - - # Use entire group_key object - expect_error( - xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - slide_value = nrow(.group_key) - ), - NA - ) -}) - -test_that("epix_slide2 computation via dots outputs the same result using col names and the data var", { - xx_ref <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - sum_binary = sum(time_value) - ) - - xx1 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - sum_binary = sum(.x$time_value) - ) - - expect_identical(xx1, xx_ref) - - xx2 <- xx %>% - group_by(.data$geo_value) %>% - epix_slide2( - before = 2, - sum_binary = sum(.data$time_value) - ) - - expect_identical(xx2, xx_ref) -}) - -test_that("`epix_slide2` doesn't decay date output", { - expect_true( - xx$DT %>% - as_tibble() %>% - mutate(across(c(time_value, version), ~ as.Date("2000-01-01") + .x - 1L)) %>% - as_epi_archive2() %>% - epix_slide2(before = 5L, ~ attr(.x, "metadata")$as_of) %>% - `[[`("slide_value") %>% - inherits("Date") - ) -}) - -test_that("`epix_slide2` can access objects inside of helper functions", { - helper <- function(archive_haystack, time_value_needle) { - archive_haystack %>% epix_slide2(has_needle = time_value_needle %in% time_value, before = 365000L) - } - expect_error( - helper(archive_cases_dv_subset_2, as.Date("2021-01-01")), - NA - ) - expect_error( - helper(xx, 3L), - NA - ) -}) diff --git a/tests/testthat/test-grouped_epi_archive.R b/tests/testthat/test-grouped_epi_archive.R index 45251a89..413741aa 100644 --- a/tests/testthat/test-grouped_epi_archive.R +++ b/tests/testthat/test-grouped_epi_archive.R @@ -16,10 +16,6 @@ test_that("Grouping, regrouping, and ungrouping archives works as intended", { ) %>% as_epi_archive(other_keys = "age_group") - # Ensure that we're using testthat edition 3's idea of "identical", which is - # not as strict as `identical`: - testthat::local_edition(3) - # Test equivalency claims in example: by_both_keys <- toy_archive %>% group_by(geo_value, age_group) expect_identical( diff --git a/tests/testthat/test-grouped_epi_archive_new.R b/tests/testthat/test-grouped_epi_archive_new.R deleted file mode 100644 index 8f0133b9..00000000 --- a/tests/testthat/test-grouped_epi_archive_new.R +++ /dev/null @@ -1,104 +0,0 @@ -test_that("Grouping, regrouping, and ungrouping archives works as intended", { - # From an example: - library(dplyr) - toy_archive <- - tribble( - ~geo_value, ~age_group, ~time_value, ~version, ~value, - "us", "adult", "2000-01-01", "2000-01-02", 121, - "us", "pediatric", "2000-01-02", "2000-01-03", 5, # (addition) - "us", "adult", "2000-01-01", "2000-01-03", 125, # (revision) - "us", "adult", "2000-01-02", "2000-01-03", 130 # (addition) - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value), - version = as.Date(version) - ) %>% - as_epi_archive2(other_keys = "age_group") - - # Ensure that we're using testthat edition 3's idea of "identical", which is - # not as strict as `identical`: - testthat::local_edition(3) - - # Test equivalency claims in example: - by_both_keys <- toy_archive %>% group_by(geo_value, age_group) - expect_identical( - by_both_keys, - toy_archive %>% group_by(geo_value) %>% group_by(age_group, .add = TRUE) - ) - grouping_cols <- c("geo_value", "age_group") - expect_identical( - by_both_keys, - toy_archive %>% group_by(across(all_of(grouping_cols))) - ) - - expect_identical( - toy_archive %>% group_by(geo_value), - toy_archive %>% group_by(geo_value, age_group) %>% ungroup(age_group) - ) - - # Test `.drop` behavior: - expect_error(toy_archive %>% group_by(.drop = "bogus"), - regexp = "Must be of type 'logical', not 'character'" - ) - expect_warning(toy_archive %>% group_by(.drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) - expect_warning(toy_archive %>% group_by(geo_value, .drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_no_factors" - ) - expect_warning( - grouped_factor_then_nonfactor <- - toy_archive %>% group_by(age_group, geo_value, .drop = FALSE), - class = "epiprocess__group_by_epi_archive__drop_FALSE_nonfactor_after_factor" - ) - expect_identical( - grouped_factor_then_nonfactor %>% - epix_slide2(before = 10, s = sum(value)), - tibble::tribble( - ~age_group, ~geo_value, ~time_value, ~s, - "pediatric", NA_character_, "2000-01-02", 0, - "adult", "us", "2000-01-02", 121, - "pediatric", "us", "2000-01-03", 5, - "adult", "us", "2000-01-03", 255 - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) - ) %>% - # # See - # # https://github.com/cmu-delphi/epiprocess/pull/290#issuecomment-1489099157 - # # and - # # https://github.com/cmu-delphi/epiprocess/pull/311#issuecomment-1535149256 - # # for why this is commented out, pending some design - # # decisions. - # # - # as_epi_df(geo_type = "nation", # bug; want "custom" from NA; issue #242 - # as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(age_group, geo_value, time_value, s) %>% - group_by(age_group, geo_value, .drop = FALSE) - ) - expect_identical( - toy_archive %>% - group_by(geo_value, age_group, .drop = FALSE) %>% - epix_slide2(before = 10, s = sum(value)), - tibble::tribble( - ~geo_value, ~age_group, ~time_value, ~s, - "us", "pediatric", "2000-01-02", 0, - "us", "adult", "2000-01-02", 121, - "us", "pediatric", "2000-01-03", 5, - "us", "adult", "2000-01-03", 255 - ) %>% - mutate( - age_group = ordered(age_group, c("pediatric", "adult")), - time_value = as.Date(time_value) - ) %>% - # as_epi_df(as_of = as.Date("2000-01-03"), - # additional_metadata = list(other_keys = "age_group")) %>% - # # put back in expected order; see issue #166: - # select(geo_value, age_group, time_value, s) %>% - group_by(geo_value, age_group, .drop = FALSE) - ) -}) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 7ab63f19..5be5330f 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -1,6 +1,7 @@ library(dplyr) -ea <- archive_cases_dv_subset$clone() +ea <- archive_cases_dv_subset %>% + clone() ea2_data <- tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, @@ -17,35 +18,27 @@ ea2_data <- tibble::tribble( ) %>% dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) -# epix_as_of tests -test_that("epix_as_of behaves identically to as_of method", { - expect_identical( - epix_as_of(ea, max_version = min(ea$DT$version)), - ea$as_of(max_version = min(ea$DT$version)) - ) -}) - -test_that("Errors are thrown due to bad as_of inputs", { +test_that("Errors are thrown due to bad epix_as_of inputs", { # max_version cannot be of string class rather than date class - expect_error(ea$as_of("2020-01-01")) + expect_error(ea %>% epix_as_of("2020-01-01")) # max_version cannot be later than latest version - expect_error(ea$as_of(as.Date("2025-01-01"))) + expect_error(ea %>% epix_as_of(as.Date("2025-01-01"))) # max_version cannot be a vector - expect_error(ea$as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) + expect_error(ea %>% epix_as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) }) test_that("Warning against max_version being clobberable", { # none by default - expect_warning(regexp = NA, ea$as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea$as_of(max_version = min(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea %>% epix_as_of(max_version = min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable <- ea$clone() + ea_with_clobberable <- ea %>% clone() ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) - expect_warning(ea_with_clobberable$as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea_with_clobberable$as_of(max_version = min(ea$DT$version))) + expect_warning(ea_with_clobberable %>% epix_as_of(max_version = max(ea$DT$version))) + expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(max_version = min(ea$DT$version))) }) -test_that("as_of properly grabs the data and doesn't mutate key", { +test_that("epix_as_of properly grabs the data and doesn't mutate key", { d <- as.Date("2020-06-01") ea2 <- ea2_data %>% @@ -99,7 +92,7 @@ test_that("epix_truncate_version_after doesn't filter if max_verion at latest ve ea2 <- ea2_data %>% as_epi_archive() - ea_expected <- ea2$clone() + ea_expected <- ea2 %>% clone() ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -114,7 +107,7 @@ test_that("epix_truncate_version_after returns the same grouping type as input e epix_truncate_versions_after(max_version = as.Date("2020-06-04")) expect_true(is_epi_archive(ea_as_of, grouped_okay = FALSE)) - ea2_grouped <- ea2$group_by(geo_value) + ea2_grouped <- ea2 %>% group_by(geo_value) ea_as_of <- ea2_grouped %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -125,11 +118,11 @@ test_that("epix_truncate_version_after returns the same grouping type as input e test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { ea2 <- ea2_data %>% as_epi_archive() - ea2 <- ea2$group_by(geo_value) + ea2 <- ea2 %>% group_by(geo_value) - ea_expected <- ea2$clone() + ea_expected <- ea2 %>% clone() ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of$groups(), ea_expected$groups()) + expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) }) diff --git a/tests/testthat/test-methods-epi_archive_new.R b/tests/testthat/test-methods-epi_archive_new.R deleted file mode 100644 index eb2c14be..00000000 --- a/tests/testthat/test-methods-epi_archive_new.R +++ /dev/null @@ -1,136 +0,0 @@ -library(dplyr) - -ea <- archive_cases_dv_subset_2 %>% - clone() - -ea2_data <- tibble::tribble( - ~geo_value, ~time_value, ~version, ~cases, - "ca", "2020-06-01", "2020-06-01", 1, - "ca", "2020-06-01", "2020-06-02", 2, - # - "ca", "2020-06-02", "2020-06-02", 0, - "ca", "2020-06-02", "2020-06-03", 1, - "ca", "2020-06-02", "2020-06-04", 2, - # - "ca", "2020-06-03", "2020-06-03", 1, - # - "ca", "2020-06-04", "2020-06-04", 4, -) %>% - dplyr::mutate(dplyr::across(c(time_value, version), as.Date)) - -# epix_as_of tests -test_that("epix_as_of behaves identically to as_of method", { - expect_identical( - epix_as_of2(ea, max_version = min(ea$DT$version)), - ea %>% as_of(max_version = min(ea$DT$version)) - ) -}) - -test_that("Errors are thrown due to bad as_of inputs", { - # max_version cannot be of string class rather than date class - expect_error(ea %>% as_of("2020-01-01")) - # max_version cannot be later than latest version - expect_error(ea %>% as_of(as.Date("2025-01-01"))) - # max_version cannot be a vector - expect_error(ea %>% as_of(c(as.Date("2020-01-01"), as.Date("2020-01-02")))) -}) - -test_that("Warning against max_version being clobberable", { - # none by default - expect_warning(regexp = NA, ea %>% as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea %>% as_of(max_version = min(ea$DT$version))) - # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable <- ea %>% clone() - ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) - expect_warning(ea_with_clobberable %>% as_of(max_version = max(ea$DT$version))) - expect_warning(regexp = NA, ea_with_clobberable %>% as_of(max_version = min(ea$DT$version))) -}) - -test_that("as_of properly grabs the data and doesn't mutate key", { - d <- as.Date("2020-06-01") - - ea2 <- ea2_data %>% - as_epi_archive2() - - old_key <- data.table::key(ea2$DT) - - edf_as_of <- ea2 %>% - epix_as_of2(max_version = as.Date("2020-06-03")) - - edf_expected <- as_epi_df(tibble( - geo_value = "ca", - time_value = d + 0:2, - cases = c(2, 1, 1) - ), as_of = as.Date("2020-06-03")) - - expect_equal(edf_as_of, edf_expected, ignore_attr = c(".internal.selfref", "sorted")) - expect_equal(data.table::key(ea2$DT), old_key) -}) - -test_that("Errors are thrown due to bad epix_truncate_versions_after inputs", { - # x must be an archive - expect_error(epix_truncate_versions_after(data.frame(), as.Date("2020-01-01"))) - # max_version cannot be of string class rather than date class - expect_error(epix_truncate_versions_after(ea, "2020-01-01")) - # max_version cannot be a vector - expect_error(epix_truncate_versions_after(ea, c(as.Date("2020-01-01"), as.Date("2020-01-02")))) - # max_version cannot be missing - expect_error(epix_truncate_versions_after(ea, as.Date(NA))) - # max_version cannot be after latest version in archive - expect_error(epix_truncate_versions_after(ea, as.Date("2025-01-01"))) -}) - -test_that("epix_truncate_version_after properly grabs the data and doesn't mutate key", { - ea2 <- ea2_data %>% - as_epi_archive2() - - old_key <- data.table::key(ea2$DT) - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-02")) - - ea_expected <- ea2_data[1:3, ] %>% - as_epi_archive2() - - expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) - expect_equal(data.table::key(ea2$DT), old_key) -}) - -test_that("epix_truncate_version_after doesn't filter if max_verion at latest version", { - ea2 <- ea2_data %>% - as_epi_archive2() - - ea_expected <- ea2 %>% clone() - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of, ea_expected, ignore_attr = c(".internal.selfref", "sorted")) -}) - -test_that("epix_truncate_version_after returns the same grouping type as input epi_archive", { - ea2 <- ea2_data %>% - as_epi_archive2() - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_epi_archive2(ea_as_of, grouped_okay = FALSE)) - - ea2_grouped <- ea2 %>% group_by(geo_value) - - ea_as_of <- ea2_grouped %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_grouped_epi_archive2(ea_as_of)) -}) - - -test_that("epix_truncate_version_after returns the same groups as input grouped_epi_archive", { - ea2 <- ea2_data %>% - as_epi_archive2() - ea2 <- ea2 %>% group_by(geo_value) - - ea_expected <- ea2 %>% clone() - - ea_as_of <- ea2 %>% - epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_equal(ea_as_of %>% groups(), ea_expected %>% groups()) -}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 83cc07f6..dbe15450 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -1,13 +1,3 @@ -test_that("new summarizing functions work", { - x <- c(3, 4, 5, 9, NA) - expect_equal(min_na_rm(x), 3) -}) - -test_that("Other capital letter functions work", { - x <- c(1, 2, 3, 4, 5) - expect_equal(extend_r(x), c(1, 2, 3, 4, 5, 5)) -}) - test_that("guess_geo_type tests for different types of geo_value's", { # California, New York states <- c("ca", "ny") diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index c010c1f3..1ea13c5f 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -9,14 +9,13 @@ vignette: > In this vignette, we discuss how to use the sliding functionality in the `epiprocess` package with less common grouping schemes or with computations that -have advanced output structures. -The output of a slide computation should either be an atomic value/vector, or a -data frame. This data frame can have multiple columns, multiple rows, or both. +have advanced output structures. The output of a slide computation should either +be an atomic value/vector, or a data frame. This data frame can have multiple +columns, multiple rows, or both. During basic usage (e.g., when all optional arguments are set to their defaults): * `epi_slide(edf, , .....)`: - * keeps **all** columns of `edf`, adds computed column(s) * outputs **one row per row in `edf`** (recycling outputs from computations appropriately if there are multiple time series bundled @@ -26,9 +25,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults) `dplyr::arrange(time_value, .by_group = TRUE)`** * outputs an **`epi_df`** if the required columns are present, otherwise a tibble - * `epix_slide(ea, , .....)`: - * keeps **grouping and `time_value`** columns of `ea`, adds computed column(s) * outputs **any number of rows** (computations are allowed to output any @@ -40,6 +37,7 @@ During basic usage (e.g., when all optional arguments are set to their defaults) * outputs a **tibble** These differences in basic behavior make some common slide operations require less boilerplate: + * predictors and targets calculated with `epi_slide` are automatically lined up with each other and with the signals from which they were calculated; and * computations for an `epix_slide` can output data frames with any number of @@ -84,13 +82,14 @@ simple synthetic example. ```{r message = FALSE} library(epiprocess) library(dplyr) +set.seed(123) edf <- tibble( geo_value = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), x = seq_along(geo_value) + 0.01 * rnorm(length(geo_value)), ) %>% - as_epi_df() + as_epi_df(as_of = as.Date("2024-03-20")) # 2-day trailing average, per geo value edf %>% @@ -111,17 +110,17 @@ edf %>% edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% - as_epi_archive2() %>% + as_epi_archive() %>% group_by(geo_value) %>% - epix_slide2(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + epix_slide(x_2dav = mean(x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() edf %>% # pretend that observations about time_value t are reported in version t (nowcasts) mutate(version = time_value) %>% - as_epi_archive2() %>% + as_epi_archive() %>% group_by(geo_value) %>% - epix_slide2(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% + epix_slide(~ mean(.x$x), before = 1, ref_time_values = as.Date("2020-06-02")) %>% ungroup() ``` @@ -219,9 +218,9 @@ edf %>% edf %>% mutate(version = time_value) %>% - as_epi_archive2() %>% + as_epi_archive() %>% group_by(geo_value) %>% - epix_slide2( + epix_slide( a = data.frame(x_2dav = mean(x), x_2dma = mad(x)), ref_time_values = as.Date("2020-06-02"), before = 1, as_list_col = FALSE, names_sep = NULL @@ -317,17 +316,17 @@ x <- y1 %>% version = issue, percent_cli = value ) %>% - as_epi_archive2(compactify = FALSE) + as_epi_archive(compactify = FALSE) # mutating merge operation: -x <- epix_merge2( +x <- epix_merge( x, y2 %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value ) %>% - as_epi_archive2(compactify = FALSE), + as_epi_archive(compactify = FALSE), sync = "locf", compactify = FALSE ) @@ -338,9 +337,9 @@ library(data.table) library(ggplot2) theme_set(theme_bw()) -x <- archive_cases_dv_subset_2$DT %>% +x <- archive_cases_dv_subset$DT %>% filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive2(compactify = FALSE) + as_epi_archive(compactify = FALSE) ``` Next, we extend the ARX function to handle multiple geo values, since in the @@ -458,7 +457,7 @@ data. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} # Latest snapshot of data, and forecast dates -x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, max_version = max(x$DT$version)) fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month" @@ -468,7 +467,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% - epix_slide2( + epix_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, args = prob_arx_args(ahead = ahead) ), @@ -525,10 +524,7 @@ separate ARX model on each state. As in the archive vignette, we can see a difference between version-aware (right column) and -unaware (left column) forecasting, as well. - ## Attribution The `case_rate_7d_av` data used in this document is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. - - diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd index 205ed084..dca595ff 100644 --- a/vignettes/aggregation.Rmd +++ b/vignettes/aggregation.Rmd @@ -34,7 +34,7 @@ x <- pub_covidcast( ) %>% select(geo_value, time_value, cases = value) %>% full_join(y, by = "geo_value") %>% - as_epi_df() + as_epi_df(as_of = as.Date("2024-03-20")) ``` The data contains 16,212 rows and 5 columns. @@ -192,7 +192,7 @@ running `epi_slide()` on the zero-filled data brings these trailing averages ```{r} xt %>% - as_epi_df() %>% + as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% @@ -203,7 +203,7 @@ xt %>% print(n = 7) xt_filled %>% - as_epi_df() %>% + as_epi_df(as_of = as.Date("2024-03-20")) %>% group_by(geo_value) %>% epi_slide(cases_7dav = mean(cases), before = 6) %>% ungroup() %>% diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 0b57d639..6193981a 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -7,16 +7,16 @@ vignette: > %\VignetteEncoding{UTF-8} --- -In addition to the `epi_df` data structure, which we have been working with all -along in these vignettes, the `epiprocess` package has a companion structure -called `epi_archive`. In comparison to an `epi_df` object, which can be seen as -storing a single snapshot of a data set with the most up-to-date signal values -as of some given time, an `epi_archive` object stores the full version history -of a data set. Many signals of interest for epidemiological tracking are subject -to revision (some more than others), and paying attention to data revisions can -be important for all sorts of downstream data analysis and modeling tasks. - -This vignette walks through working with `epi_archive` objects and demonstrates +In addition to the `epi_df` data structure, the `epiprocess` package has a +companion structure called `epi_archive`. In comparison to an `epi_df` object, +which can be seen as storing a single snapshot of a data set with the most +up-to-date signal values as of some given time, an `epi_archive` object stores +the full version history of a data set. Many signals of interest for +epidemiological tracking are subject to revision (some more than others) and +paying attention to data revisions can be important for all sorts of downstream +data analysis and modeling tasks. + +This vignette walks through working with `epi_archive()` objects and demonstrates some of their key functionality. We'll work with a signal on the percentage of doctor's visits with CLI (COVID-like illness) computed from medical insurance claims, available through the [COVIDcast @@ -55,9 +55,8 @@ library(ggplot2) ## Getting data into `epi_archive` format -An epi_archive object -can be constructed from a data frame, data table, or tibble, provided that it -has (at least) the following columns: +An `epi_archive()` object can be constructed from a data frame, data table, or +tibble, provided that it has (at least) the following columns: * `geo_value`: the geographic value associated with each row of measurements. * `time_value`: the time value associated with each row of measurements. @@ -71,30 +70,30 @@ As we can see from the above, the data frame returned by format, with `issue` playing the role of `version`. We can now use `as_epi_archive()` to bring it into `epi_archive` format. For removal of redundant version updates in `as_epi_archive` using compactify, please refer to -the compactify vignette. +the [compactify vignette](articles/compactify.html). ```{r, eval=FALSE} x <- dv %>% select(geo_value, time_value, version = issue, percent_cli = value) %>% - as_epi_archive2(compactify = TRUE) + as_epi_archive(compactify = TRUE) class(x) print(x) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset_2$DT %>% +x <- archive_cases_dv_subset$DT %>% select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive2(compactify = TRUE) + as_epi_archive(compactify = TRUE) class(x) print(x) ``` -An `epi_archive` is special kind of class called an R6 class. Its primary field -is a data table `DT`, which is of class `data.table` (from the `data.table` -package), and has columns `geo_value`, `time_value`, `version`, as well as any -number of additional columns. +An `epi_archive` is consists of a primary field `DT`, which is a data table +(from the `data.table` package) that has the columns `geo_value`, `time_value`, +`version` (and possibly additional ones), and other metadata fields, such as +`geo_type` and `time_type`. ```{r} class(x$DT) @@ -112,9 +111,7 @@ key(x$DT) ``` In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions. **A word of caution:** R6 objects, -unlike most other objects in R, have reference semantics. An important -consequence of this is that objects are not copied when modified. +fill in data between recorded versions. ```{r} original_value <- x$DT$percent_cli[1] @@ -125,10 +122,6 @@ head(x$DT) x$DT$percent_cli[1] <- original_value ``` -To make a copy, we can use the `clone()` method for an R6 class, as in `y <- -x$clone()`. You can read more about reference semantics in Hadley Wickham's -[Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book. - ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive` @@ -146,15 +139,12 @@ call (as it did in the case above). ## Producing snapshots in `epi_df` form -A key method of an `epi_archive` class is `as_of()`, which generates a snapshot +A key method of an `epi_archive` class is `epix_as_of()`, which generates a snapshot of the archive in `epi_df` format. This represents the most up-to-date values of -the signal variables as of a given version. This can be accessed via `x$as_of()` -for an `epi_archive` object `x`, but the package also provides a simple wrapper -function `epix_as_of()` since this is likely a more familiar interface for users -not familiar with R6 (or object-oriented programming). +the signal variables as of a given version. ```{r} -x_snapshot <- epix_as_of2(x, max_version = as.Date("2021-06-01")) +x_snapshot <- epix_as_of(x, max_version = as.Date("2021-06-01")) class(x_snapshot) head(x_snapshot) max(x_snapshot$time_value) @@ -174,7 +164,7 @@ this case, since updates to the current version may still come in at a later point in time, due to various reasons, such as synchronization issues. ```{r} -x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, max_version = max(x$DT$version)) ``` Below, we pull several snapshots from the archive, spaced one month apart. We @@ -188,7 +178,7 @@ theme_set(theme_bw()) self_max <- max(x$DT$version) versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") snapshots <- map_dfr(versions, function(v) { - epix_as_of2(x, max_version = v) %>% mutate(version = v) + epix_as_of(x, max_version = v) %>% mutate(version = v) }) %>% bind_rows( x_latest %>% mutate(version = self_max) @@ -219,19 +209,14 @@ they overestimate it (both states towards the beginning of 2021), though not quite as dramatically. Modeling the revision process, which is often called *backfill modeling*, is an important statistical problem in it of itself. - - ## Merging `epi_archive` objects Now we demonstrate how to merge two `epi_archive` objects together, e.g., so that grabbing data from multiple sources as of a particular version can be -performed with a single `as_of` call. The `epi_archive` class provides a method -`merge()` precisely for this purpose. The wrapper function is called -`epix_merge()`; this wrapper avoids mutating its inputs, while `x$merge` will -mutate `x`. Below we merge the working `epi_archive` of versioned percentage CLI -from outpatient visits to another one of versioned COVID-19 case reporting data, -which we fetch the from the [COVIDcast +performed with a single `epix_as_of` call. The function `epix_merge()` is made +for this purpose. Below we merge the working `epi_archive` of versioned +percentage CLI from outpatient visits to another one of versioned COVID-19 case +reporting data, which we fetch the from the [COVIDcast API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the rate scale (counts per 100,000 people in the population). @@ -258,34 +243,26 @@ y <- pub_covidcast( issues = epirange(20200601, 20211201) ) %>% select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive2(compactify = TRUE) + as_epi_archive(compactify = TRUE) -x <- epix_merge2(x, y, sync = "locf", compactify = TRUE) +x <- epix_merge(x, y, sync = "locf", compactify = TRUE) print(x) head(x$DT) ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset_2 +x <- archive_cases_dv_subset print(x) head(x$DT) ``` -Importantly, see that `x$merge` mutated `x` to hold the result of the merge. We -could also have used `xy = epix_merge(x,y)` to avoid mutating `x`. See the -documentation for either for more detailed descriptions of what mutation, -pointer aliasing, and pointer reseating is possible. - ## Sliding version-aware computations -Lastly, we demonstrate another key method of the `epi_archive` class, which is -the `slide()` method. It works just like `epi_slide()` does for an `epi_df` -object, but with one key difference: it performs version-aware computations. -That is, for the computation at any given reference time t, it only uses **data -that would have been available as of t**. The wrapper function is called -`epix_slide()`; again, this is just for convenience/familiarity---and its -interface is purposely designed mirror that of `epi_slide()` for `epi_df` -objects. +Lastly, we demonstrate another key method for archives, which is the +`epix_slide()`. It works just like `epi_slide()` does for an `epi_df` object, +but with one key difference: it performs version-aware computations. That is, +for the computation at any given reference time t, it only uses **data that +would have been available as of t**. For the demonstration, we'll revisit the forecasting example from the [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html), and now @@ -362,7 +339,7 @@ fc_time_values <- seq(as.Date("2020-08-01"), z <- x %>% group_by(geo_value) %>% - epix_slide2( + epix_slide( fc = prob_arx(x = percent_cli, y = case_rate_7d_av), before = 119, ref_time_values = fc_time_values ) %>% @@ -389,14 +366,14 @@ points in time and forecast horizons. The former comes from using `epi_slide()` to the latest snapshot of the data `x_latest`. ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of2(x, max_version = max(x$DT$version)) +x_latest <- epix_as_of(x, max_version = max(x$DT$version)) # Simple function to produce forecasts k weeks ahead k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { if (as_of) { x %>% group_by(.data$geo_value) %>% - epix_slide2( + epix_slide( fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), before = 119, ref_time_values = fc_time_values ) %>% diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 0b68c73b..8579be6a 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -32,10 +32,10 @@ from the second from the third value included. library(epiprocess) library(dplyr) -dt <- archive_cases_dv_subset_2$DT +dt <- archive_cases_dv_subset$DT -locf_omitted <- as_epi_archive2(dt) -locf_included <- as_epi_archive2(dt, compactify = FALSE) +locf_omitted <- as_epi_archive(dt) +locf_included <- as_epi_archive(dt, compactify = FALSE) head(locf_omitted$DT) head(locf_included$DT) @@ -48,8 +48,8 @@ LOCF-redundant values can mar the performance of dataset operations. As the colu ```{r} dt2 <- select(dt, -percent_cli) -locf_included_2 <- as_epi_archive2(dt2, compactify = FALSE) -locf_omitted_2 <- as_epi_archive2(dt2, compactify = TRUE) +locf_included_2 <- as_epi_archive(dt2, compactify = FALSE) +locf_omitted_2 <- as_epi_archive(dt2, compactify = TRUE) ``` In this example, a huge proportion of the original version update data were @@ -93,7 +93,7 @@ We would also like to measure the speed of `epi_archive` methods. # Performance of as_of iterated 200 times iterate_as_of <- function(my_ea) { for (i in 1:1000) { - my_ea$as_of(min(my_ea$DT$time_value) + i - 1000) + my_ea %>% epix_as_of(min(my_ea$DT$time_value) + i - 1000) } } @@ -101,11 +101,12 @@ speeds <- rbind(speeds, speed_test(iterate_as_of, "as_of_1000x")) # Performance of slide slide_median <- function(my_ea) { - my_ea$slide(median = median(.data$case_rate_7d_av), before = 7) + my_ea %>% epix_slide(median = median(.data$case_rate_7d_av), before = 7) } speeds <- rbind(speeds, speed_test(slide_median, "slide_median")) ``` + Here is a detailed performance comparison: ```{r} diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 85b1e1f4..12020d89 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -125,7 +125,7 @@ and `time_value` columns, respectively, but inferring the `as_of` field is not as easy. See the documentation for `as_epi_df()` more details. ```{r} -x <- as_epi_df(cases) %>% +x <- as_epi_df(cases, as_of = as.Date("2024-03-20")) %>% select(geo_value, time_value, total_cases = value) attributes(x)$metadata @@ -169,7 +169,7 @@ data.frame( # misnamed reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value))) -) %>% as_epi_df() +) %>% as_epi_df(as_of = as.Date("2024-03-20")) ``` The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`. @@ -220,7 +220,7 @@ ex3 <- ex3 %>% state = rep(tolower("MA"), 6), pol = rep(c("blue", "swing", "swing"), each = 2) ) %>% - as_epi_df(additional_metadata = list(other_keys = c("state", "pol"))) + as_epi_df(additional_metadata = list(other_keys = c("state", "pol")), as_of = as.Date("2024-03-20")) attr(ex3, "metadata") ``` @@ -256,7 +256,7 @@ cases in Canada in 2003, from the x <- outbreaks::sars_canada_2003 %>% mutate(geo_value = "ca") %>% select(geo_value, time_value = date, starts_with("cases")) %>% - as_epi_df(geo_type = "nation") + as_epi_df(geo_type = "nation", as_of = as.Date("2024-03-20")) head(x) @@ -303,7 +303,7 @@ x <- outbreaks::ebola_sierraleone_2014 %>% filter(cases == 1) %>% group_by(geo_value, time_value) %>% summarise(cases = sum(cases)) %>% - as_epi_df(geo_type = "province") + as_epi_df(geo_type = "province", as_of = as.Date("2024-03-20")) ggplot(x, aes(x = time_value, y = cases)) + geom_col(aes(fill = geo_value), show.legend = FALSE) + @@ -312,11 +312,8 @@ ggplot(x, aes(x = time_value, y = cases)) + labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") ``` - - ## Attribution This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. [From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - From 183d0f1230f643ee1c355ee24b4c27e3de65a798 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 22 Apr 2024 15:11:34 -0700 Subject: [PATCH 03/18] `new_archive` validation tightening, streamlining, type stability - Forbid `NA` `compactify` - Remove `missing` checks when `is.null` suffices - Remove redundant default code - Make local `other_keys` have more consistent typing across branches --- R/archive.R | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/R/archive.R b/R/archive.R index f871d239..7d9a9539 100644 --- a/R/archive.R +++ b/R/archive.R @@ -307,13 +307,13 @@ new_epi_archive <- function( } # If time type is missing, then try to guess it - if (missing(time_type) || is.null(time_type)) { + if (is.null(time_type)) { time_type <- guess_time_type(x$time_value) } # Finish off with small checks on keys variables and metadata - if (missing(other_keys)) other_keys <- NULL - if (missing(additional_metadata) || is.null(additional_metadata)) additional_metadata <- list() + if (is.null(other_keys)) other_keys <- character(0L) + if (is.null(additional_metadata)) additional_metadata <- list() if (!test_subset(other_keys, names(x))) { cli_abort("`other_keys` must be contained in the column names of `x`.") } @@ -325,17 +325,11 @@ new_epi_archive <- function( } # Conduct checks and apply defaults for `compactify` - if (missing(compactify)) { - compactify <- NULL - } - assert_logical(compactify, len = 1, null.ok = TRUE) + assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: - if (missing(clobberable_versions_start)) { - clobberable_versions_start <- NA - } - if (missing(versions_end) || is.null(versions_end)) { + if (is.null(versions_end)) { versions_end <- max_version_with_row_in(x) } validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) From 7bc4735d7df9e26ca11d76561d30e52e7950d392 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 22 Apr 2024 15:55:47 -0700 Subject: [PATCH 04/18] Fix version_bound_arg validation issues - Validate length. - Tweak message regarding type since typeof is length 1. - Actually raise error if NA when NA not allowed. - Make tests check the source of the error, since not being specific + R configuration masked some of these issues. --- R/archive.R | 37 ++++++++++++++------ tests/testthat/test-archive-version-bounds.R | 23 +++++++----- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/R/archive.R b/R/archive.R index 7d9a9539..55de1132 100644 --- a/R/archive.R +++ b/R/archive.R @@ -28,23 +28,38 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, x_arg = rlang::caller_arg(version_bound)) { if (is.null(version_bound)) { cli_abort( - "{version_bound_arg} cannot be NULL" + "{version_bound_arg} cannot be NULL", + class = "epiprocess__version_bound_null" ) } - if (na_ok && is.na(version_bound)) { - return(invisible(NULL)) - } - if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + if (length(version_bound) != 1L) { cli_abort( - "{version_bound_arg} must have the same classes as x$version, - which is {class(x$version)}", + "{version_bound_arg} must have length of 1", + class = "epiprocess__version_bound_wrong_length" ) } - if (!test_set_equal(typeof(version_bound), typeof(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same types as x$version, + if (is.na(version_bound)) { + if (!na_ok) { + cli_abort( + "{version_bound_arg} cannot be NA", + class = "epiprocess__version_bound_na_with_na_not_okay" + ) + } + } else { + if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same classes as x$version, + which is {class(x$version)}", + class = "epiprocess__version_bound_mismatched_class" + ) + } + if (!identical(typeof(version_bound), typeof(x[["version"]]))) { + cli_abort( + "{version_bound_arg} must have the same type as x$version, which is {typeof(x$version)}", - ) + class = "epiprocess__version_bound_mismatched_typeof" + ) + } } return(invisible(NULL)) diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index d78167d7..c052b47b 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -2,17 +2,21 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { my_version_bound <- NA x <- tibble::tibble(version = 5L) validate_version_bound(my_version_bound, x, na_ok = TRUE) - expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE)) + expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE), + class = "epiprocess__version_bound_na_with_na_not_okay") }) test_that("`validate_version_bound` catches bounds that are the wrong length", { x <- tibble::tibble(version = 5L) my_version_bound1a <- NULL - expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE)) + expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE), + class = "epiprocess__version_bound_null") my_version_bound1b <- integer(0L) - expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE)) + expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE), + class = "epiprocess__version_bound_wrong_length") my_version_bound2 <- c(2, 10) - expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE)) + expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE), + class = "epiprocess__version_bound_wrong_length") }) test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { @@ -52,11 +56,13 @@ test_that("`validate_version_bound` validate and class checks together allow and validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") expect_error( validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes" + regexp = "must have the same classes", + class = "epiprocess__version_bound_mismatched_class" ) expect_error( validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes" + regexp = "must have the same classes", + class = "epiprocess__version_bound_mismatched_class" ) # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same classes") @@ -64,7 +70,7 @@ test_that("`validate_version_bound` validate and class checks together allow and expect_error(validate_version_bound( `class<-`(list(2), "clazz"), tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - ), regexp = "must have the same types") + ), regexp = "must have the same type", class = "epiprocess__version_bound_mismatched_typeof") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) @@ -109,7 +115,8 @@ test_that("archive version bounds args work as intended", { ), regexp = "`clobberable_versions_start`.*indicated that there were later observed versions" ) - expect_error(as_epi_archive(update_tbl, versions_end = NA), regexp = "must have the same classes") + expect_error(as_epi_archive(update_tbl, versions_end = NA), + class = "epiprocess__version_bound_na_with_na_not_okay") ea_default <- as_epi_archive(update_tbl) ea_default %>% epix_as_of(measurement_date + 4L) expect_warning( From 5392641e42b945dd24366f78bee61737180a5539 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 22 Apr 2024 16:11:06 -0700 Subject: [PATCH 05/18] Don't use cli_ with dynamic format strings See https://rlang.r-lib.org/reference/topic-condition-formatting.html#transitioning-from-abort-to-cli-abort- --- R/archive.R | 14 ++++---------- R/methods-epi_archive.R | 14 +++++++------- R/methods-epi_df.R | 6 ++---- R/utils.R | 9 +++++---- tests/testthat/test-archive-version-bounds.R | 15 ++++++++++----- 5 files changed, 28 insertions(+), 30 deletions(-) diff --git a/R/archive.R b/R/archive.R index 55de1132..5e5ef436 100644 --- a/R/archive.R +++ b/R/archive.R @@ -351,21 +351,15 @@ new_epi_archive <- function( validate_version_bound(versions_end, x, na_ok = FALSE) if (nrow(x) > 0L && versions_end < max(x[["version"]])) { cli_abort( - sprintf( - "`versions_end` was %s, but `x` contained - updates for a later version or versions, up through %s", - versions_end, max(x[["version"]]) - ), + "`versions_end` was {versions_end}, but `x` contained + updates for a later version or versions, up through {max(x$version)}", class = "epiprocess__versions_end_earlier_than_updates" ) } if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { cli_abort( - sprintf( - "`versions_end` was %s, but a `clobberable_versions_start` - of %s indicated that there were later observed versions", - versions_end, clobberable_versions_start - ), + "`versions_end` was {versions_end}, but a `clobberable_versions_start` + of {clobberable_versions_start} indicated that there were later observed versions", class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" ) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index f6846488..3c351f37 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -169,13 +169,13 @@ epix_fill_through_version <- function(x, fill_versions_end, nonkey_cols <- setdiff(names(x$DT), key(x$DT)) next_version_tag <- next_after(x$versions_end) if (next_version_tag > fill_versions_end) { - cli_abort(sprintf(paste( - "Apparent problem with `next_after` method:", - "archive contained observations through version %s", - "and the next possible version was supposed to be %s,", - "but this appeared to jump from a version < %3$s", - "to one > %3$s, implying at least one version in between." - ), x$versions_end, next_version_tag, fill_versions_end)) + cli_abort(paste( + "Apparent problem with {.code next_after} method:", + "archive contained observations through version {x$versions_end}", + "and the next possible version was supposed to be {next_version_tag},", + "but this appeared to jump from a version < {fill_versions_end}", + "to one > {fill_versions_end}, implying at least one version in between." + )) } nonversion_key_vals_ever_recorded <- unique(x$DT, by = nonversion_key_cols) # In edge cases, the `unique` result can alias the original diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 632dc3a3..97fc4576 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -136,10 +136,8 @@ dplyr_reconstruct.epi_df <- function(data, template) { dup_col_names <- cn[duplicated(cn)] if (length(dup_col_names) != 0) { cli_abort(paste0( - "Column name(s) ", - paste(unique(dup_col_names), - collapse = ", " - ), " must not be duplicated." + "Column name(s) {unique(dup_col_names)}", + "must not be duplicated." )) } diff --git a/R/utils.R b/R/utils.R index ea7afc2f..0233f775 100644 --- a/R/utils.R +++ b/R/utils.R @@ -142,7 +142,7 @@ assert_sufficient_f_args <- function(f, ...) { # `f` doesn't take enough args. if (rlang::dots_n(...) == 0L) { # common case; try for friendlier error message - cli_abort(sprintf("`f` must take at least %s arguments", n_mandatory_f_args), + cli_abort("`f` must take at least {n_mandatory_f_args} arguments", class = "epiprocess__assert_sufficient_f_args__f_needs_min_args", epiprocess__f = f ) @@ -312,7 +312,7 @@ as_slide_computation <- function(f, ...) { if (is_formula(f)) { if (length(f) > 2) { - cli_abort(sprintf("%s must be a one-sided formula", arg), + cli_abort("{.code {arg}} must be a one-sided formula", class = "epiprocess__as_slide_computation__formula_is_twosided", epiprocess__f = f, call = call @@ -350,7 +350,8 @@ as_slide_computation <- function(f, ...) { } cli_abort( - sprintf("Can't convert an object of class %s to a slide computation", paste(collapse = " ", deparse(class(f)))), + "Can't convert an object of class {paste(collapse = ' ', deparse(class(f)))} + to a slide computation", class = "epiprocess__as_slide_computation__cant_convert_catchall", epiprocess__f = f, epiprocess__f_class = class(f), @@ -687,7 +688,7 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { guess_period <- function(ref_time_values, ref_time_values_arg = rlang::caller_arg(ref_time_values)) { sorted_distinct_ref_time_values <- sort(unique(ref_time_values)) if (length(sorted_distinct_ref_time_values) < 2L) { - cli_abort(sprintf("Not enough distinct values in `%s` to guess the period.", ref_time_values_arg)) + cli_abort("Not enough distinct values in {.code {ref_time_values_arg}} to guess the period.", ref_time_values_arg) } skips <- diff(sorted_distinct_ref_time_values) decayed_skips <- diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index c052b47b..a8f12a3f 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -3,20 +3,24 @@ test_that("`validate_version_bound` allows/catches `NA` as requested", { x <- tibble::tibble(version = 5L) validate_version_bound(my_version_bound, x, na_ok = TRUE) expect_error(validate_version_bound(my_version_bound, x, na_ok = FALSE), - class = "epiprocess__version_bound_na_with_na_not_okay") + class = "epiprocess__version_bound_na_with_na_not_okay" + ) }) test_that("`validate_version_bound` catches bounds that are the wrong length", { x <- tibble::tibble(version = 5L) my_version_bound1a <- NULL expect_error(validate_version_bound(my_version_bound1a, x, na_ok = TRUE), - class = "epiprocess__version_bound_null") + class = "epiprocess__version_bound_null" + ) my_version_bound1b <- integer(0L) expect_error(validate_version_bound(my_version_bound1b, x, na_ok = TRUE), - class = "epiprocess__version_bound_wrong_length") + class = "epiprocess__version_bound_wrong_length" + ) my_version_bound2 <- c(2, 10) expect_error(validate_version_bound(my_version_bound2, x, na_ok = TRUE), - class = "epiprocess__version_bound_wrong_length") + class = "epiprocess__version_bound_wrong_length" + ) }) test_that("`validate_version_bound` validate and class checks together allow and catch as intended", { @@ -116,7 +120,8 @@ test_that("archive version bounds args work as intended", { regexp = "`clobberable_versions_start`.*indicated that there were later observed versions" ) expect_error(as_epi_archive(update_tbl, versions_end = NA), - class = "epiprocess__version_bound_na_with_na_not_okay") + class = "epiprocess__version_bound_na_with_na_not_okay" + ) ea_default <- as_epi_archive(update_tbl) ea_default %>% epix_as_of(measurement_date + 4L) expect_warning( From 9491797f51651142787cfaef219586ba5d0f022f Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 23 Apr 2024 10:29:51 -0700 Subject: [PATCH 06/18] Note reassignment in R6 migration for mutating functions, + details --- NEWS.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4d52ded5..1ee3384f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,11 +32,15 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat ## Breaking changes - Switched `epi_df`'s `other_keys` default from `NULL` to `character(0)`; PR #390 -- Refactor `epi_archive` to use S3 instead of R6 for its object model. The +- Refactored `epi_archive` to use S3 instead of R6 for its object model. The functionality stay the same, but it will break the member function interface. - For migration, convert `epi_archive$merge` to `epi_archive %>% epix_merge` - (similar for `slide`, `fill_through_version`, `truncate_after_version`, and - `as_of`) (#340). + For migration, you can usually just convert `epi_archive$merge(...)` to + `epi_archive <- epi_archive %>% epix_merge(...)` (and the same for + `fill_through_version` and `truncate_after_version`) and + `epi_archive$slide(...)` to `epi_archive %>% epix_slide(...)` (and the same + for `as_of`, `group_by`, `slide`, etc.) (#340). In some limited situations, + such as if you have a helper function that calls `epi_archive$merge` etc. on + one of its arguments, then you may need to more carefully refactor them. # epiprocess 0.7.0 From 27b798dbfcc427a7534f6cd679d949b9b449eee5 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 24 Apr 2024 10:46:34 -0700 Subject: [PATCH 07/18] Adjust class, typeof, colnames checks & messaging, gcd messaging - S3 class vectors are ordered, so use `identical` - Improve class vector formatting - Tweak other `class` and `typeof` message text - Improve duplicate colnames message - Improve vector interpolation formatting - Fix typo in GCD error messaging --- NAMESPACE | 1 + R/archive.R | 10 +++++----- R/methods-epi_archive.R | 16 ++++++++-------- R/methods-epi_df.R | 9 ++++++--- R/utils.R | 2 +- tests/testthat/test-archive-version-bounds.R | 16 ++++++++-------- tests/testthat/test-methods-epi_df.R | 4 ++-- 7 files changed, 31 insertions(+), 27 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index cc25c7d7..2174b78b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,6 +98,7 @@ importFrom(checkmate,test_subset) importFrom(checkmate,vname) importFrom(cli,cli_abort) importFrom(cli,cli_inform) +importFrom(cli,cli_vec) importFrom(cli,cli_warn) importFrom(data.table,":=") importFrom(data.table,address) diff --git a/R/archive.R b/R/archive.R index 5e5ef436..325cdf48 100644 --- a/R/archive.R +++ b/R/archive.R @@ -46,17 +46,17 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, ) } } else { - if (!test_set_equal(class(version_bound), class(x[["version"]]))) { + if (!identical(class(version_bound), class(x[["version"]]))) { cli_abort( - "{version_bound_arg} must have the same classes as x$version, - which is {class(x$version)}", + "{version_bound_arg} must have the same `class` vector as x$version, + which has a `class` of {paste(collapse = ' ', deparse(class(x$version)))}", class = "epiprocess__version_bound_mismatched_class" ) } if (!identical(typeof(version_bound), typeof(x[["version"]]))) { cli_abort( - "{version_bound_arg} must have the same type as x$version, - which is {typeof(x$version)}", + "{version_bound_arg} must have the same `typeof` as x$version, + which has a `typeof` of {typeof(x$version)}", class = "epiprocess__version_bound_mismatched_typeof" ) } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 3c351f37..0d527244 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -68,14 +68,14 @@ epix_as_of <- function(x, max_version, min_time_value = -Inf, all_versions = FAL if (length(other_keys) == 0) other_keys <- NULL # Check a few things on max_version - if (!test_set_equal(class(max_version), class(x$DT$version))) { + if (!identical(class(max_version), class(x$DT$version))) { cli_abort( - "`max_version` must have the same classes as `epi_archive$DT$version`." + "`max_version` must have the same `class` vector as `epi_archive$DT$version`." ) } - if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { + if (!identical(typeof(max_version), typeof(x$DT$version))) { cli_abort( - "`max_version` must have the same types as `epi_archive$DT$version`." + "`max_version` must have the same `typeof` as `epi_archive$DT$version`." ) } assert_scalar(max_version, na.ok = FALSE) @@ -859,11 +859,11 @@ epix_truncate_versions_after <- function(x, max_version) { #' @rdname epix_truncate_versions_after #' @export epix_truncate_versions_after.epi_archive <- function(x, max_version) { - if (!test_set_equal(class(max_version), class(x$DT$version))) { - cli_abort("`max_version` must have the same classes as `epi_archive$DT$version`.") + if (!identical(class(max_version), class(x$DT$version))) { + cli_abort("`max_version` must have the same `class` as `epi_archive$DT$version`.") } - if (!test_set_equal(typeof(max_version), typeof(x$DT$version))) { - cli_abort("`max_version` must have the same types as `epi_archive$DT$version`.") + if (!identical(typeof(max_version), typeof(x$DT$version))) { + cli_abort("`max_version` must have the same `typeof` as `epi_archive$DT$version`.") } assert_scalar(max_version, na.ok = FALSE) if (max_version > x$versions_end) { diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 97fc4576..526a1171 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -123,6 +123,7 @@ decay_epi_df <- function(x) { #' @param template `epi_df` template to use to restore #' @return `epi_df` or degrade into `tbl_df` #' @importFrom dplyr dplyr_reconstruct +#' @importFrom cli cli_vec #' @export #' @noRd dplyr_reconstruct.epi_df <- function(data, template) { @@ -135,9 +136,11 @@ dplyr_reconstruct.epi_df <- function(data, template) { # Duplicate columns, cli_abort dup_col_names <- cn[duplicated(cn)] if (length(dup_col_names) != 0) { - cli_abort(paste0( - "Column name(s) {unique(dup_col_names)}", - "must not be duplicated." + cli_abort(c( + "Duplicate column names are not allowed", + "i" = "Duplicated column name{?s}: + {cli_vec(unique(dup_col_names), + style = list('vec-sep2' = ', ', 'vec-last' = ', '))}" )) } diff --git a/R/utils.R b/R/utils.R index 0233f775..5662ab4f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -609,7 +609,7 @@ gcd2num <- function(a, b, rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) { # `b_curr` is the candidate GCD / iterand; check first if it seems too small: if (abs(b_curr) <= iatol) { cli_abort( - "No GCD found; remaining potential Gads are all too small relative + "No GCD found; remaining potential GCDs are all too small relative to one/both of the original inputs; see `irtol` setting." ) } diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index a8f12a3f..d36fcab1 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -51,30 +51,30 @@ test_that("`validate_version_bound` validate and class checks together allow and my_version_bound1 <- `class<-`(24, "c1") expect_error( validate_version_bound(my_version_bound1, x_int, na_ok = FALSE), - regexp = "must have the same classes as" + regexp = "must have the same `class` vector as" ) my_version_bound2 <- `class<-`(list(12), c("c2a", "c2b", "c2c")) - expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), regexp = "must have the same classes") + expect_error(validate_version_bound(my_version_bound2, x_list, na_ok = FALSE), regexp = "must have the same `class`") # Want no error matching date to date or datetime to datetime, but no interop due to tz issues: validate_version_bound(my_date, x_date, version_bound_arg = "vb") validate_version_bound(my_datetime, x_datetime, version_bound_arg = "vb") expect_error( validate_version_bound(my_datetime, x_date, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes", + regexp = "must have the same `class`", class = "epiprocess__version_bound_mismatched_class" ) expect_error( validate_version_bound(my_date, x_datetime, na_ok = TRUE, version_bound_arg = "vb"), - regexp = "must have the same classes", + regexp = "must have the same `class`", class = "epiprocess__version_bound_mismatched_class" ) # Bad: - expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same classes") - expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same classes") + expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same `class`") + expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same `class`") expect_error(validate_version_bound( `class<-`(list(2), "clazz"), tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - ), regexp = "must have the same type", class = "epiprocess__version_bound_mismatched_typeof") + ), regexp = "must have the same `typeof`", class = "epiprocess__version_bound_mismatched_typeof") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) @@ -99,7 +99,7 @@ test_that("archive version bounds args work as intended", { clobberable_versions_start = 1241, versions_end = measurement_date ), - regexp = "must have the same classes" + regexp = "must have the same `class`" ) expect_error( as_epi_archive(update_tbl[integer(0L), ]), diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index cff88dac..b071d3ec 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -84,11 +84,11 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", { test_that("When duplicate cols in subset should abort", { expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)], - "Column name(s) time_value, y must not be duplicated.", + "Duplicated column names: time_value, y", fixed = TRUE ) expect_error(toy_epi_df[1:4, c(1, 2:4, 1)], - "Column name(s) geo_value must not be duplicated.", + "Duplicated column name: geo_value", fixed = TRUE ) }) From cd7f83cc8dd30d7685017ce6a358bffb80df1543 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 24 Apr 2024 15:06:16 -0700 Subject: [PATCH 08/18] lint: use rlang %||% idiom --- NAMESPACE | 1 + R/archive.R | 19 +++++-------------- R/data.R | 2 +- R/epi_df.R | 4 +--- R/epiprocess.R | 1 + R/growth_rate.R | 10 +++++----- 6 files changed, 14 insertions(+), 23 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 2174b78b..d0c67474 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -144,6 +144,7 @@ importFrom(purrr,map) importFrom(purrr,map_lgl) importFrom(rlang,"!!!") importFrom(rlang,"!!") +importFrom(rlang,"%||%") importFrom(rlang,.data) importFrom(rlang,.env) importFrom(rlang,arg_match) diff --git a/R/archive.R b/R/archive.R index 325cdf48..ffd8b1f0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -316,19 +316,12 @@ new_epi_archive <- function( cli_abort("Column `version` must not contain missing values.") } - # If geo type is missing, then try to guess it - if (is.null(geo_type)) { - geo_type <- guess_geo_type(x$geo_value) - } - - # If time type is missing, then try to guess it - if (is.null(time_type)) { - time_type <- guess_time_type(x$time_value) - } + geo_type <- geo_type %||% guess_geo_type(x$geo_value) + time_type <- time_type %||% guess_time_type(x$time_value) + other_keys <- other_keys %||% character(0L) + additional_metadata <- additional_metadata %||% list() # Finish off with small checks on keys variables and metadata - if (is.null(other_keys)) other_keys <- character(0L) - if (is.null(additional_metadata)) additional_metadata <- list() if (!test_subset(other_keys, names(x))) { cli_abort("`other_keys` must be contained in the column names of `x`.") } @@ -344,9 +337,7 @@ new_epi_archive <- function( # Apply defaults and conduct checks for # `clobberable_versions_start`, `versions_end`: - if (is.null(versions_end)) { - versions_end <- max_version_with_row_in(x) - } + versions_end <- versions_end %||% max_version_with_row_in(x) validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) validate_version_bound(versions_end, x, na_ok = FALSE) if (nrow(x) > 0L && versions_end < max(x[["version"]])) { diff --git a/R/data.R b/R/data.R index cbaaa901..ec677547 100644 --- a/R/data.R +++ b/R/data.R @@ -123,7 +123,7 @@ some_package_is_being_unregistered <- function(parent_n = 0L) { # evaluation has been triggered via `unregister`. simple_call_names <- purrr::map_chr(calls_to_inspect, function(call) { maybe_simple_call_name <- rlang::call_name(call) - if (is.null(maybe_simple_call_name)) NA_character_ else maybe_simple_call_name + maybe_simple_call_name %||% NA_character_ }) # `pkgload::unregister` is an (the?) exported function that forces # package-level promises, while `pkgload:::unregister_namespace` is the diff --git a/R/epi_df.R b/R/epi_df.R index 9ed677cf..f4df1604 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -99,9 +99,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, assert_data_frame(x) assert_list(additional_metadata) - if (is.null(additional_metadata[["other_keys"]])) { - additional_metadata[["other_keys"]] <- character(0L) - } + additional_metadata[["other_keys"]] <- additional_metadata[["other_keys"]] %||% character(0L) # If geo type is missing, then try to guess it if (missing(geo_type)) { diff --git a/R/epiprocess.R b/R/epiprocess.R index e3918708..8981c630 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -9,6 +9,7 @@ #' assert_int assert_numeric check_data_frame vname check_atomic #' anyInfinite test_subset test_set_equal checkInt #' @importFrom cli cli_abort cli_inform cli_warn +#' @importFrom rlang %||% #' @name epiprocess "_PACKAGE" utils::globalVariables(c(".x", ".group_key", ".ref_time_value")) diff --git a/R/growth_rate.R b/R/growth_rate.R index f2b326a1..4537375d 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -226,11 +226,11 @@ growth_rate <- function(x = seq_along(y), y, x0 = x, k <- params$k # Default parameters - if (is.null(ord)) ord <- 3 - if (is.null(maxsteps)) maxsteps <- 1000 - if (is.null(cv)) cv <- TRUE - if (is.null(df)) df <- "min" - if (is.null(k)) k <- 3 + ord <- ord %||% 3 + maxsteps <- maxsteps %||% 1000 + cv <- cv %||% TRUE + df <- df %||% "min" + k <- k %||% 3 # Check cv and df combo if (is.numeric(df)) cv <- FALSE From 9641bdec7578c05e8886ecd6b6bb92d4ed658d49 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 26 Apr 2024 10:01:51 -0700 Subject: [PATCH 09/18] Use an actual existence-checking [[ instead of pluck --- NAMESPACE | 1 + R/archive.R | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index d0c67474..c444e7fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -93,6 +93,7 @@ importFrom(checkmate,assert_scalar) importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) +importFrom(checkmate,check_names) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) diff --git a/R/archive.R b/R/archive.R index ffd8b1f0..a39c39a0 100644 --- a/R/archive.R +++ b/R/archive.R @@ -75,6 +75,8 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, #' @return `max(x$version)` if it has any rows; raises error if it has 0 rows or #' an `NA` version value #' +#' @importFrom checkmate check_names +#' #' @export max_version_with_row_in <- function(x) { if (nrow(x) == 0L) { @@ -87,7 +89,8 @@ max_version_with_row_in <- function(x) { class = "epiprocess__max_version_cannot_be_used" ) } else { - version_col <- purrr::pluck(x, "version") # error not NULL if doesn't exist + check_names(names(x), must.include = "version") + version_col <- x[["version"]] if (anyNA(version_col)) { cli_abort("version values cannot be NA", class = "epiprocess__version_values_must_not_be_na" From 7bf29d809687728753cb96947176369ca74c1d23 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 26 Apr 2024 16:52:42 -0700 Subject: [PATCH 10/18] Improve print.epi_archive on empty archives --- R/archive.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index a39c39a0..5124420a 100644 --- a/R/archive.R +++ b/R/archive.R @@ -486,8 +486,12 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { "Non-standard DT keys: {setdiff(key(x$DT), c('geo_value', 'time_value', 'version'))}" }, - "i" = "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}", - "i" = "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}", + "i" = if (nrow(x$DT) != 0L) { + "Min/max time values: {min(x$DT$time_value)} / {max(x$DT$time_value)}" + }, + "i" = if (nrow(x$DT) != 0L) { + "First/last version with update: {min(x$DT$version)} / {max(x$DT$version)}" + }, "i" = if (!is.na(x$clobberable_versions_start)) { "Clobberable versions start: {x$clobberable_versions_start}" }, From e3902db0ec40859c6dd5d4b7fe2ee9ceaeac7db0 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 26 Apr 2024 17:24:33 -0700 Subject: [PATCH 11/18] Improve print.epi_archive in Rmds, capture.output, logs Print to stdout and without using messages for all the output. Prevents Rmds from splitting print output into multiple chunks. Allows `capture.output` by default to capture all expected output, and the same for logging utilities expecting regular output to come from stdout. --- NAMESPACE | 3 ++- R/archive.R | 8 ++++---- R/epiprocess.R | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c444e7fb..907fc451 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -97,10 +97,11 @@ importFrom(checkmate,check_names) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) +importFrom(cli,cat_line) importFrom(cli,cli_abort) -importFrom(cli,cli_inform) importFrom(cli,cli_vec) importFrom(cli,cli_warn) +importFrom(cli,format_message) importFrom(data.table,":=") importFrom(data.table,address) importFrom(data.table,as.data.table) diff --git a/R/archive.R b/R/archive.R index 5124420a..95e7eee9 100644 --- a/R/archive.R +++ b/R/archive.R @@ -469,7 +469,7 @@ as_epi_archive <- function(x, geo_type = NULL, time_type = NULL, other_keys = NU #' @param methods Boolean; whether to print all available methods of #' the archive #' -#' @importFrom cli cli_inform +#' @importFrom cli cat_line format_message #' @importFrom rlang check_dots_empty #' @export print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { @@ -480,7 +480,7 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { )) } - cli_inform( + cat_line(format_message( c( ">" = if (class) "An `epi_archive` object, with metadata:", "i" = if (length(setdiff(key(x$DT), c("geo_value", "time_value", "version"))) > 0) { @@ -498,9 +498,9 @@ print.epi_archive <- function(x, ..., class = TRUE, methods = TRUE) { "i" = "Versions end: {x$versions_end}", "i" = "A preview of the table ({nrow(x$DT)} rows x {ncol(x$DT)} columns):" ) - ) - + )) print(x$DT[]) + return(invisible(x)) } diff --git a/R/epiprocess.R b/R/epiprocess.R index 8981c630..5ef80739 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -8,7 +8,7 @@ #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic #' anyInfinite test_subset test_set_equal checkInt -#' @importFrom cli cli_abort cli_inform cli_warn +#' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% #' @name epiprocess "_PACKAGE" From 6f37e3eda846cf0d4d6719e0bbd14dc453d6d5bd Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 08:29:41 -0700 Subject: [PATCH 12/18] Eliminate single-use, unneeded local var --- R/archive.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/archive.R b/R/archive.R index 95e7eee9..61ab5182 100644 --- a/R/archive.R +++ b/R/archive.R @@ -367,8 +367,7 @@ new_epi_archive <- function( DT <- as.data.table(x, key = key_vars) # nolint: object_name_linter if (!identical(key_vars, key(DT))) setkeyv(DT, cols = key_vars) - maybe_first_duplicate_key_row_index <- anyDuplicated(DT, by = key(DT)) - if (maybe_first_duplicate_key_row_index != 0L) { + if (anyDuplicated(DT, by = key(DT)) != 0L) { cli_abort("`x` must have one row per unique combination of the key variables. If you have additional key variables other than `geo_value`, `time_value`, and `version`, such as an age group column, please specify them in `other_keys`. From 861cdd3d90f01b59eca7ad5c9bec8e8fdd08d6d8 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 09:20:45 -0700 Subject: [PATCH 13/18] Remove outdated doc comment This applied for a different default `clobberable_versions_start`. --- R/methods-epi_archive.R | 1 - man/epix_as_of.Rd | 1 - 2 files changed, 2 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 0d527244..bfec55f8 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -23,7 +23,6 @@ #' @return An `epi_df` object. #' #' @examples -#' # warning message of data latency shown #' epix_as_of( #' archive_cases_dv_subset, #' max_version = max(archive_cases_dv_subset$DT$version) diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index dc359a7b..1833aad3 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -35,7 +35,6 @@ given version. See the \href{https://cmu-delphi.github.io/epiprocess/articles/ar examples. } \examples{ -# warning message of data latency shown epix_as_of( archive_cases_dv_subset, max_version = max(archive_cases_dv_subset$DT$version) From a92aa580cc54445bcdf7bd55e64db9d9655ed8e6 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 09:44:45 -0700 Subject: [PATCH 14/18] More updates due to `clobberable_versions_start` default, bad name - Update `epix_as_of` docs further based on `clobberable_versions_start` now defaulting to `NA`. - Don't include `max_version =` in example `epix_as_of` calls as it seems atypical and a strange name if extracting a snapshot rather than an archive. --- R/methods-epi_archive.R | 45 +++++++++++++++++++++-------------------- man/epix_as_of.Rd | 45 +++++++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index bfec55f8..b6bf3837 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -30,30 +30,31 @@ #' #' range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 #' -#' epix_as_of( -#' archive_cases_dv_subset, -#' max_version = as.Date("2020-06-12") -#' ) +#' epix_as_of(archive_cases_dv_subset, as.Date("2020-06-12")) #' -#' # When fetching a snapshot as of the latest version with update data in the -#' # archive, a warning is issued by default, as this update data might not yet -#' # be finalized (for example, if data versions are labeled with dates, these -#' # versions might be overwritten throughout the corresponding days with -#' # additional data or "hotfixes" of erroroneous data; when we build an archive -#' # based on database queries, the latest available update might still be -#' # subject to change, but previous versions should be finalized). We can -#' # muffle such warnings with the following pattern: -#' withCallingHandlers( -#' { -#' epix_as_of( -#' archive_cases_dv_subset, -#' max_version = max(archive_cases_dv_subset$DT$version) -#' ) -#' }, -#' epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +#' # --- Advanced: --- +#' +#' # When requesting recent versions of a data set, there can be some +#' # reproducibility issues. For example, requesting data as of the current date +#' # may return different values based on whether today's data is available yet +#' # or not. Other factors include the time it takes between data becoming +#' # available and when you download the data, and whether the data provider +#' # will overwrite ("clobber") version data rather than just publishing new +#' # versions. You can include information about these factors by setting the +#' # `clobberable_versions_start` and `versions_end` of an `epi_archive`, in +#' # which case you will get warnings about potential reproducibility issues: +#' +#' archive_cases_dv_subset2 <- as_epi_archive( +#' archive_cases_dv_subset$DT, +#' # Suppose last version with an update could potentially be rewritten +#' # (a.k.a. "hotfixed", "clobbered", etc.): +#' clobberable_versions_start = max(archive_cases_dv_subset$DT$version), +#' # Suppose today is the following day, and there are no updates out yet: +#' versions_end <- max(archive_cases_dv_subset$DT$version) + 1L, +#' compactify = TRUE #' ) -#' # Since R 4.0, there is a `globalCallingHandlers` function that can be used -#' # to globally toggle these warnings. +#' +#' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) #' #' @importFrom data.table between key #' @export diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd index 1833aad3..42b121fa 100644 --- a/man/epix_as_of.Rd +++ b/man/epix_as_of.Rd @@ -42,29 +42,30 @@ epix_as_of( range(archive_cases_dv_subset$DT$version) # 2020-06-02 -- 2021-12-01 -epix_as_of( - archive_cases_dv_subset, - max_version = as.Date("2020-06-12") -) +epix_as_of(archive_cases_dv_subset, as.Date("2020-06-12")) -# When fetching a snapshot as of the latest version with update data in the -# archive, a warning is issued by default, as this update data might not yet -# be finalized (for example, if data versions are labeled with dates, these -# versions might be overwritten throughout the corresponding days with -# additional data or "hotfixes" of erroroneous data; when we build an archive -# based on database queries, the latest available update might still be -# subject to change, but previous versions should be finalized). We can -# muffle such warnings with the following pattern: -withCallingHandlers( - { - epix_as_of( - archive_cases_dv_subset, - max_version = max(archive_cases_dv_subset$DT$version) - ) - }, - epiprocess__snapshot_as_of_clobberable_version = function(wrn) invokeRestart("muffleWarning") +# --- Advanced: --- + +# When requesting recent versions of a data set, there can be some +# reproducibility issues. For example, requesting data as of the current date +# may return different values based on whether today's data is available yet +# or not. Other factors include the time it takes between data becoming +# available and when you download the data, and whether the data provider +# will overwrite ("clobber") version data rather than just publishing new +# versions. You can include information about these factors by setting the +# `clobberable_versions_start` and `versions_end` of an `epi_archive`, in +# which case you will get warnings about potential reproducibility issues: + +archive_cases_dv_subset2 <- as_epi_archive( + archive_cases_dv_subset$DT, + # Suppose last version with an update could potentially be rewritten + # (a.k.a. "hotfixed", "clobbered", etc.): + clobberable_versions_start = max(archive_cases_dv_subset$DT$version), + # Suppose today is the following day, and there are no updates out yet: + versions_end <- max(archive_cases_dv_subset$DT$version) + 1L, + compactify = TRUE ) -# Since R 4.0, there is a `globalCallingHandlers` function that can be used -# to globally toggle these warnings. + +epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) } From 1cead3036da700af0887a37b0bd19f162aada632 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 10:23:18 -0700 Subject: [PATCH 15/18] fix: grouped_epi_archives are not epi_archives We don't want to try to use an `epi_archive` method implementation on a `grouped_epi_archive`, or have `is_epi_archive` succeed on them even with `grouped_okay = FALSE`, to prevent attempted extraction of nonexistent fields. --- R/grouped_epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 140ff9d3..8473ab35 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -80,7 +80,7 @@ new_grouped_epi_archive <- function(x, vars, drop) { list( private = private ), - class = c("grouped_epi_archive", "epi_archive") + class = "grouped_epi_archive" )) } From 38c3322d1166da557f9fd54b9f0c86736aefab09 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 10:51:18 -0700 Subject: [PATCH 16/18] Clean up `clone()` usage - Use new `%>% clone()` when we want a deep copy - Use aliasing instead of shallow copies, since with S3 lists we should not have the threat of mutation of the shallow list structure --- R/grouped_epi_archive.R | 6 +++--- R/methods-epi_archive.R | 2 +- tests/testthat/test-epix_fill_through_version.R | 3 --- tests/testthat/test-epix_merge.R | 3 +-- tests/testthat/test-epix_slide.R | 5 ++--- tests/testthat/test-methods-epi_archive.R | 9 ++++----- 6 files changed, 11 insertions(+), 17 deletions(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 8473ab35..7688e0f3 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -370,7 +370,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, # DT; convert and wrap: data.table::setattr(.data_group, "sorted", dt_key) data.table::setDT(.data_group, key = dt_key) - .data_group_archive <- as_of_archive %>% clone() + .data_group_archive <- as_of_archive .data_group_archive$DT <- .data_group comp_one_grp(.data_group_archive, .group_key, f = f, ..., @@ -437,8 +437,8 @@ is_grouped_epi_archive <- function(x) { #' @export clone.grouped_epi_archive <- function(x, ...) { - ungrouped <- x$private$ungrouped %>% clone() - new_grouped_epi_archive(ungrouped, x$private$vars, x$private$drop) + x$private$ungrouped <- x$private$ungrouped %>% clone() + x } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index b6bf3837..5ae75b11 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -533,7 +533,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { out_dt <- dplyr::dplyr_col_modify(in_tbl, col_modify_cols) %>% data.table::setattr("sorted", data.table::key(.data$DT)) %>% data.table::setDT(key = key(.data$DT)) - out_archive <- .data %>% clone() + out_archive <- .data out_archive$DT <- out_dt request_names <- names(col_modify_cols) return(list( diff --git a/tests/testthat/test-epix_fill_through_version.R b/tests/testthat/test-epix_fill_through_version.R index 89bb4804..b87b26ed 100644 --- a/tests/testthat/test-epix_fill_through_version.R +++ b/tests/testthat/test-epix_fill_through_version.R @@ -59,16 +59,13 @@ test_that("epix_fill_through_version does not mutate x", { as_epi_archive(tibble::tibble(geo_value = 1L, time_value = 1L, version = 1L, value = 10L)) )) { ea_orig_before <- clone(ea_orig) - ea_orig_dt_before_copy <- data.table::copy(ea_orig$DT) some_unobserved_version <- 8L ea_fill_na <- epix_fill_through_version(ea_orig, some_unobserved_version, "na") expect_identical(ea_orig_before, ea_orig) - expect_identical(ea_orig_dt_before_copy, ea_orig$DT) ea_fill_locf <- epix_fill_through_version(ea_orig, some_unobserved_version, "locf") expect_identical(ea_orig_before, ea_orig) - expect_identical(ea_orig_dt_before_copy, ea_orig$DT) } }) diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R index 9bcc7d67..c29301b8 100644 --- a/tests/testthat/test-epix_merge.R +++ b/tests/testthat/test-epix_merge.R @@ -1,6 +1,5 @@ test_that("epix_merge requires forbids on invalid `y`", { - ea <- archive_cases_dv_subset %>% - clone() + ea <- archive_cases_dv_subset expect_error(epix_merge(ea, data.frame(x = 1))) }) diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index b7a3e946..5c20abc2 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -371,7 +371,6 @@ test_that("epix_slide with all_versions option has access to all older versions" } ea_orig_mirror <- ea %>% clone() - ea_orig_mirror$DT <- data.table::copy(ea_orig_mirror$DT) result1 <- ea %>% group_by() %>% @@ -485,7 +484,7 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { ) # Test the same sort of thing when grouping by geo in an archive with multiple geos. - ea_multigeo <- ea %>% clone() + ea_multigeo <- ea ea_multigeo$DT <- rbind( ea_multigeo$DT, copy(ea_multigeo$DT)[, geo_value := "y"][, binary := -binary][] @@ -565,7 +564,7 @@ test_that("epix_slide with all_versions option works as intended", { # back depending on the decisions there: # # test_that("`epix_slide` uses `versions_end` as a resulting `epi_df`'s `as_of`", { -# ea_updated_stale = ea %>% clone() +# ea_updated_stale = ea # ea_updated_stale$versions_end <- ea_updated_stale$versions_end + 3 # (dbl) # # # expect_identical( diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 5be5330f..a5ba48fa 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -1,7 +1,6 @@ library(dplyr) -ea <- archive_cases_dv_subset %>% - clone() +ea <- archive_cases_dv_subset ea2_data <- tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, @@ -32,7 +31,7 @@ test_that("Warning against max_version being clobberable", { expect_warning(regexp = NA, ea %>% epix_as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea %>% epix_as_of(max_version = min(ea$DT$version))) # but with `clobberable_versions_start` non-`NA`, yes - ea_with_clobberable <- ea %>% clone() + ea_with_clobberable <- ea ea_with_clobberable$clobberable_versions_start <- max(ea_with_clobberable$DT$version) expect_warning(ea_with_clobberable %>% epix_as_of(max_version = max(ea$DT$version))) expect_warning(regexp = NA, ea_with_clobberable %>% epix_as_of(max_version = min(ea$DT$version))) @@ -92,7 +91,7 @@ test_that("epix_truncate_version_after doesn't filter if max_verion at latest ve ea2 <- ea2_data %>% as_epi_archive() - ea_expected <- ea2 %>% clone() + ea_expected <- ea2 ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) @@ -120,7 +119,7 @@ test_that("epix_truncate_version_after returns the same groups as input grouped_ as_epi_archive() ea2 <- ea2 %>% group_by(geo_value) - ea_expected <- ea2 %>% clone() + ea_expected <- ea2 ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) From 5ea168e88d966db096acd671054099763d5a7559 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 29 Apr 2024 11:05:19 -0700 Subject: [PATCH 17/18] Remove remaining reference to R6 method --- R/grouped_epi_archive.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 7688e0f3..97f5a4c9 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -216,7 +216,7 @@ epix_slide.grouped_epi_archive <- function(x, f, ..., before, ref_time_values, if ("group_by" %in% nse_dots_names(...)) { cli_abort(" The `group_by` argument to `slide` has been removed; please use - the `group_by` S3 generic function or `$group_by` R6 method + the `group_by()` S3 generic function before the slide instead. (If you were instead trying to pass a `group_by` argument to `f` or create a column named `group_by`, this check is a false positive, but you will still need to use a From e61e11a1dec9290ed9f4af5ecac386af31feb90b Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 29 Apr 2024 17:39:18 -0700 Subject: [PATCH 18/18] refactor: changes from review * remove is_epi_archive and delete in epix_slide * simplify group_by_drop_default * prune library calls in tests * remove here and waldo from Suggests * pull most validation work from new_epi_archive into validate_epi_archive * call validate_epi_archive in as_epi_archive --- DESCRIPTION | 2 - NAMESPACE | 3 +- R/archive.R | 161 +++++++++++----------- R/epiprocess.R | 2 +- R/grouped_epi_archive.R | 3 +- R/methods-epi_archive.R | 3 - man/epi_archive.Rd | 20 ++- man/is_epi_archive.Rd | 35 ----- tests/testthat/test-archive.R | 2 - tests/testthat/test-autoplot.R | 3 - tests/testthat/test-compactify.R | 4 - tests/testthat/test-correlation.R | 2 - tests/testthat/test-data.R | 2 +- tests/testthat/test-epix_slide.R | 2 +- tests/testthat/test-methods-epi_archive.R | 5 +- vignettes/archive.Rmd | 9 -- 16 files changed, 102 insertions(+), 156 deletions(-) delete mode 100644 man/is_epi_archive.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2b53474c..a7a7aa93 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,12 +51,10 @@ Suggests: covidcast, devtools, epidatr, - here, knitr, outbreaks, rmarkdown, testthat (>= 3.1.5), - waldo (>= 0.3.1), withr VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 907fc451..1362b15c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,7 +64,6 @@ export(filter) export(group_by) export(group_modify) export(growth_rate) -export(is_epi_archive) export(is_epi_df) export(is_grouped_epi_archive) export(key_colnames) @@ -78,6 +77,7 @@ export(rename) export(slice) export(ungroup) export(unnest) +export(validate_epi_archive) importFrom(checkmate,anyInfinite) importFrom(checkmate,anyMissing) importFrom(checkmate,assert) @@ -94,6 +94,7 @@ importFrom(checkmate,checkInt) importFrom(checkmate,check_atomic) importFrom(checkmate,check_data_frame) importFrom(checkmate,check_names) +importFrom(checkmate,expect_class) importFrom(checkmate,test_set_equal) importFrom(checkmate,test_subset) importFrom(checkmate,vname) diff --git a/R/archive.R b/R/archive.R index 61ab5182..14918678 100644 --- a/R/archive.R +++ b/R/archive.R @@ -307,59 +307,8 @@ new_epi_archive <- function( other_keys = NULL, additional_metadata = NULL, compactify = NULL, - clobberable_versions_start = NA, + clobberable_versions_start = NULL, versions_end = NULL) { - assert_data_frame(x) - if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { - cli_abort( - "Columns `geo_value`, `time_value`, and `version` must be present in `x`." - ) - } - if (anyMissing(x$version)) { - cli_abort("Column `version` must not contain missing values.") - } - - geo_type <- geo_type %||% guess_geo_type(x$geo_value) - time_type <- time_type %||% guess_time_type(x$time_value) - other_keys <- other_keys %||% character(0L) - additional_metadata <- additional_metadata %||% list() - - # Finish off with small checks on keys variables and metadata - if (!test_subset(other_keys, names(x))) { - cli_abort("`other_keys` must be contained in the column names of `x`.") - } - if (any(c("geo_value", "time_value", "version") %in% other_keys)) { - cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") - } - if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { - cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") - } - - # Conduct checks and apply defaults for `compactify` - assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) - - # Apply defaults and conduct checks for - # `clobberable_versions_start`, `versions_end`: - versions_end <- versions_end %||% max_version_with_row_in(x) - validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) - validate_version_bound(versions_end, x, na_ok = FALSE) - if (nrow(x) > 0L && versions_end < max(x[["version"]])) { - cli_abort( - "`versions_end` was {versions_end}, but `x` contained - updates for a later version or versions, up through {max(x$version)}", - class = "epiprocess__versions_end_earlier_than_updates" - ) - } - if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { - cli_abort( - "`versions_end` was {versions_end}, but a `clobberable_versions_start` - of {clobberable_versions_start} indicated that there were later observed versions", - class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" - ) - } - - # --- End of validation and replacing missing args with defaults --- - # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -441,6 +390,54 @@ new_epi_archive <- function( ) } +#' `validate_epi_archive` ensures correctness of arguments fed to `as_epi_archive`. +#' +#' @rdname epi_archive +#' +#' @export +validate_epi_archive <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NULL, + versions_end = NULL) { + # Finish off with small checks on keys variables and metadata + if (!test_subset(other_keys, names(x))) { + cli_abort("`other_keys` must be contained in the column names of `x`.") + } + if (any(c("geo_value", "time_value", "version") %in% other_keys)) { + cli_abort("`other_keys` cannot contain \"geo_value\", \"time_value\", or \"version\".") + } + if (any(names(additional_metadata) %in% c("geo_type", "time_type"))) { + cli_warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") + } + + # Conduct checks and apply defaults for `compactify` + assert_logical(compactify, len = 1, any.missing = FALSE, null.ok = TRUE) + + # Apply defaults and conduct checks for + # `clobberable_versions_start`, `versions_end`: + validate_version_bound(clobberable_versions_start, x, na_ok = TRUE) + validate_version_bound(versions_end, x, na_ok = FALSE) + if (nrow(x) > 0L && versions_end < max(x[["version"]])) { + cli_abort( + "`versions_end` was {versions_end}, but `x` contained + updates for a later version or versions, up through {max(x$version)}", + class = "epiprocess__versions_end_earlier_than_updates" + ) + } + if (!is.na(clobberable_versions_start) && clobberable_versions_start > versions_end) { + cli_abort( + "`versions_end` was {versions_end}, but a `clobberable_versions_start` + of {clobberable_versions_start} indicated that there were later observed versions", + class = "epiprocess__versions_end_earlier_than_clobberable_versions_start" + ) + } +} + #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. @@ -448,11 +445,36 @@ new_epi_archive <- function( #' @rdname epi_archive #' #' @export -as_epi_archive <- function(x, geo_type = NULL, time_type = NULL, other_keys = NULL, - additional_metadata = list(), - compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x)) { +as_epi_archive <- function( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NULL, + versions_end = NULL) { + assert_data_frame(x) + if (!test_subset(c("geo_value", "time_value", "version"), names(x))) { + cli_abort( + "Columns `geo_value`, `time_value`, and `version` must be present in `x`." + ) + } + if (anyMissing(x$version)) { + cli_abort("Column `version` must not contain missing values.") + } + + geo_type <- geo_type %||% guess_geo_type(x$geo_value) + time_type <- time_type %||% guess_time_type(x$time_value) + other_keys <- other_keys %||% character(0L) + additional_metadata <- additional_metadata %||% list() + clobberable_versions_start <- clobberable_versions_start %||% NA + versions_end <- versions_end %||% max_version_with_row_in(x) + + validate_epi_archive( + x, geo_type, time_type, other_keys, additional_metadata, + compactify, clobberable_versions_start, versions_end + ) new_epi_archive( x, geo_type, time_type, other_keys, additional_metadata, compactify, clobberable_versions_start, versions_end @@ -652,31 +674,6 @@ group_by.epi_archive <- function(.data, ..., .add = FALSE, .drop = dplyr::group_ } -#' Test for `epi_archive` format -#' -#' @param x An object. -#' @param grouped_okay Optional; Boolean; should a `grouped_epi_archive` also -#' count? Default is `FALSE`. -#' @return `TRUE` if the object inherits from `epi_archive`. -#' -#' @export -#' @examples -#' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -#' is_epi_archive(archive_cases_dv_subset) # TRUE -#' -#' # By default, grouped_epi_archives don't count as epi_archives, as they may -#' # support a different set of operations from regular `epi_archives`. This -#' # behavior can be controlled by `grouped_okay`. -#' grouped_archive <- archive_cases_dv_subset %>% group_by(geo_value) -#' is_epi_archive(grouped_archive) # FALSE -#' is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE -#' -#' @seealso [`is_grouped_epi_archive`] -is_epi_archive <- function(x, grouped_okay = FALSE) { - inherits(x, "epi_archive") || grouped_okay && inherits(x, "grouped_epi_archive") -} - - #' Clone an `epi_archive` object. #' #' @param x An `epi_archive` object. diff --git a/R/epiprocess.R b/R/epiprocess.R index 5ef80739..dd7df87a 100644 --- a/R/epiprocess.R +++ b/R/epiprocess.R @@ -7,7 +7,7 @@ #' @importFrom checkmate assert assert_scalar assert_data_frame anyMissing #' assert_logical assert_list assert_character assert_class #' assert_int assert_numeric check_data_frame vname check_atomic -#' anyInfinite test_subset test_set_equal checkInt +#' anyInfinite test_subset test_set_equal checkInt expect_class #' @importFrom cli cli_abort cli_warn #' @importFrom rlang %||% #' @name epiprocess diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 97f5a4c9..55a0176c 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -157,8 +157,7 @@ group_by.grouped_epi_archive <- function( #' #' @export group_by_drop_default.grouped_epi_archive <- function(.tbl) { - x <- .tbl - x$private$drop + .tbl$private$drop } diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 5ae75b11..891cc064 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -797,9 +797,6 @@ epix_slide <- function( as_list_col = FALSE, names_sep = "_", all_versions = FALSE) { - if (!is_epi_archive(x, grouped_okay = TRUE)) { - cli_abort("`x` must be of class `epi_archive` or `grouped_epi_archive`.") - } UseMethod("epix_slide") } diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index b7dd649e..97ff6af0 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -3,6 +3,7 @@ \name{epi_archive} \alias{epi_archive} \alias{new_epi_archive} +\alias{validate_epi_archive} \alias{as_epi_archive} \title{\code{epi_archive} object} \usage{ @@ -13,7 +14,18 @@ new_epi_archive( other_keys = NULL, additional_metadata = NULL, compactify = NULL, - clobberable_versions_start = NA, + clobberable_versions_start = NULL, + versions_end = NULL +) + +validate_epi_archive( + x, + geo_type = NULL, + time_type = NULL, + other_keys = NULL, + additional_metadata = NULL, + compactify = NULL, + clobberable_versions_start = NULL, versions_end = NULL ) @@ -22,10 +34,10 @@ as_epi_archive( geo_type = NULL, time_type = NULL, other_keys = NULL, - additional_metadata = list(), + additional_metadata = NULL, compactify = NULL, - clobberable_versions_start = NA, - versions_end = max_version_with_row_in(x) + clobberable_versions_start = NULL, + versions_end = NULL ) } \arguments{ diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd deleted file mode 100644 index 06669709..00000000 --- a/man/is_epi_archive.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/archive.R -\name{is_epi_archive} -\alias{is_epi_archive} -\title{Test for \code{epi_archive} format} -\usage{ -is_epi_archive(x, grouped_okay = FALSE) -} -\arguments{ -\item{x}{An object.} - -\item{grouped_okay}{Optional; Boolean; should a \code{grouped_epi_archive} also -count? Default is \code{FALSE}.} -} -\value{ -\code{TRUE} if the object inherits from \code{epi_archive}. -} -\description{ -Test for \code{epi_archive} format -} -\examples{ -is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) -is_epi_archive(archive_cases_dv_subset) # TRUE - -# By default, grouped_epi_archives don't count as epi_archives, as they may -# support a different set of operations from regular `epi_archives`. This -# behavior can be controlled by `grouped_okay`. -grouped_archive <- archive_cases_dv_subset \%>\% group_by(geo_value) -is_epi_archive(grouped_archive) # FALSE -is_epi_archive(grouped_archive, grouped_okay = TRUE) # TRUE - -} -\seealso{ -\code{\link{is_grouped_epi_archive}} -} diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 2eba383d..1291e3c7 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -1,5 +1,3 @@ -library(dplyr) - test_that("first input must be a data.frame", { expect_error(as_epi_archive(c(1, 2, 3), compactify = FALSE), regexp = "Must be of type 'data.frame'." diff --git a/tests/testthat/test-autoplot.R b/tests/testthat/test-autoplot.R index ba3f8d53..0e4654eb 100644 --- a/tests/testthat/test-autoplot.R +++ b/tests/testthat/test-autoplot.R @@ -1,7 +1,4 @@ -library(dplyr) - d <- as.Date("2020-01-01") - raw_df_chr <- dplyr::bind_rows( dplyr::tibble(geo_value = "ak", time_value = d + 1:5, value = "a"), dplyr::tibble(geo_value = "al", time_value = d + 1:5, value = "d") diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R index 263d67b7..042a69ea 100644 --- a/tests/testthat/test-compactify.R +++ b/tests/testthat/test-compactify.R @@ -1,7 +1,3 @@ -library(epiprocess) -library(data.table) -library(dplyr) - dt <- archive_cases_dv_subset$DT dt <- filter(dt, geo_value == "ca") %>% filter(version <= "2020-06-15") %>% diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R index fe129616..98507434 100644 --- a/tests/testthat/test-correlation.R +++ b/tests/testthat/test-correlation.R @@ -1,5 +1,3 @@ -library(tibble) - test_that("epi_cor throws an error for a non-epi_df for its first argument", { expect_error(epi_cor(1:10, 1, 1)) expect_error(epi_cor(data.frame(x = 1:10), 1, 1)) diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index 885f0013..88ecc8c7 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -1,5 +1,5 @@ test_that("`archive_cases_dv_subset` is formed successfully", { - expect_true(is_epi_archive(archive_cases_dv_subset)) + expect_class(archive_cases_dv_subset, "epi_archive") }) test_that("`delayed_assign_with_unregister_awareness` works as expected on good promises", { diff --git a/tests/testthat/test-epix_slide.R b/tests/testthat/test-epix_slide.R index 5c20abc2..a5b72cbf 100644 --- a/tests/testthat/test-epix_slide.R +++ b/tests/testthat/test-epix_slide.R @@ -506,7 +506,7 @@ test_that("epix_as_of and epix_slide with long enough window are compatible", { test_that("epix_slide `f` is passed an ungrouped `epi_archive` when `all_versions=TRUE`", { slide_fn <- function(x, gk, rtv) { - expect_true(is_epi_archive(x)) + expect_class(x, "epi_archive") return(NA) } diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index a5ba48fa..6686400b 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -1,7 +1,4 @@ -library(dplyr) - ea <- archive_cases_dv_subset - ea2_data <- tibble::tribble( ~geo_value, ~time_value, ~version, ~cases, "ca", "2020-06-01", "2020-06-01", 1, @@ -104,7 +101,7 @@ test_that("epix_truncate_version_after returns the same grouping type as input e ea_as_of <- ea2 %>% epix_truncate_versions_after(max_version = as.Date("2020-06-04")) - expect_true(is_epi_archive(ea_as_of, grouped_okay = FALSE)) + expect_class(ea_as_of, "epi_archive") ea2_grouped <- ea2 %>% group_by(geo_value) diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd index 6193981a..a34429d9 100644 --- a/vignettes/archive.Rmd +++ b/vignettes/archive.Rmd @@ -113,15 +113,6 @@ key(x$DT) In general, the last version of each observation is carried forward (LOCF) to fill in data between recorded versions. -```{r} -original_value <- x$DT$percent_cli[1] -y <- x # This DOES NOT make a copy of x -y$DT$percent_cli[1] <- 0 -head(y$DT) -head(x$DT) -x$DT$percent_cli[1] <- original_value -``` - ## Some details on metadata The following pieces of metadata are included as fields in an `epi_archive`