Pre-release fixes #104

Merged: 51 commits, Nov 21, 2023

Commits
2bea29f
Vapply avoids loops in cfr_rolling.R
pratikunterwegs Nov 16, 2023
8d7db4c
Correct known_onsets shifting
pratikunterwegs Nov 16, 2023
7babf50
Compact iteration in cfr_time_varying.R
pratikunterwegs Nov 16, 2023
fe9dd14
Remove duplicated date col in Readme.Rmd
pratikunterwegs Nov 16, 2023
83df8e8
Automatic readme update
actions-user Nov 16, 2023
d3332ac
Vignette edits from code review
pratikunterwegs Nov 16, 2023
9199014
Remove stray DS_Store and CSV data
pratikunterwegs Nov 16, 2023
7a2498e
Remove empty test file
pratikunterwegs Nov 16, 2023
3eec2ae
Check that Poisson threshold is > 0
pratikunterwegs Nov 10, 2023
0ecb3d2
Check burn_in is a count not strictly int
pratikunterwegs Nov 10, 2023
1ce75d7
Update function documentation
pratikunterwegs Nov 10, 2023
9d0188a
Guard against noSuggests check in tests
pratikunterwegs Nov 10, 2023
4698f06
Correct URLs
pratikunterwegs Nov 10, 2023
59533a9
Pkg title change to ascertainment, uniform ORCIDs
pratikunterwegs Nov 10, 2023
4fa3354
Future install instructions, correct quick example
pratikunterwegs Nov 10, 2023
2ebcfd9
Add news and CRAN comments
pratikunterwegs Nov 10, 2023
e6e8fa0
Automatic readme update
actions-user Nov 10, 2023
c82eaca
Remove old Readme figures
pratikunterwegs Nov 16, 2023
8654c24
Add data description and source in data-raw
pratikunterwegs Nov 16, 2023
efed483
Update logo with text to paths
pratikunterwegs Nov 16, 2023
906b01a
Required args come first in estimate_ascertainment()
pratikunterwegs Nov 16, 2023
9e7b753
Remove max date argument, rm related tests
pratikunterwegs Nov 16, 2023
37c40c2
Ignore doc/ and Meta/
pratikunterwegs Nov 16, 2023
d8769f0
Remove linelist to incidence2 example in data prep vig
pratikunterwegs Nov 16, 2023
501b757
Fix Readme plotting code
pratikunterwegs Nov 16, 2023
d96e67c
Automatic readme update
actions-user Nov 16, 2023
4c29cd6
Rename known_onsets to expected_outcomes
pratikunterwegs Nov 16, 2023
1f98ff9
Use vapply in estimate_outcomes()
pratikunterwegs Nov 16, 2023
626f4df
Add date input checks to cfr_time_varyind
pratikunterwegs Nov 16, 2023
70647b8
Add internal fn for expected outcomes
pratikunterwegs Nov 16, 2023
d568569
Internal fn does not round or rm NAs
pratikunterwegs Nov 16, 2023
e5d275b
cfr_time_varying() uses round() on internal convolve fn
pratikunterwegs Nov 17, 2023
95f6360
Simplify cfr_time_varying() to assume daily data
pratikunterwegs Nov 17, 2023
073372a
Use lchoose() and round(ut * cases) in estimate_severity()
pratikunterwegs Nov 17, 2023
c201a59
Update tests and docs for
pratikunterwegs Nov 17, 2023
35221fe
Format code and remove DS_Store
pratikunterwegs Nov 17, 2023
e0e3c4b
Update vignettes, wordlist, Readme
pratikunterwegs Nov 17, 2023
2aae034
Automatic readme update
actions-user Nov 17, 2023
7cad110
Add @sbfnk as reviewer
pratikunterwegs Nov 17, 2023
12c8fb6
Correct expected_outcomes calc, rev 8d7db4c, add tv_cfr test
pratikunterwegs Nov 20, 2023
09a6b95
Add documentation on burn_in to cfr_tv fn doc and vignette
pratikunterwegs Nov 20, 2023
ad37d73
Remove TODO notes
pratikunterwegs Nov 20, 2023
b77d40c
Update Readme with related projects
pratikunterwegs Nov 20, 2023
ea3914f
Automatic readme update
actions-user Nov 20, 2023
e65005e
Clearer naming for ascertainment data
pratikunterwegs Nov 20, 2023
6d356ac
Skip test without {distributional}
pratikunterwegs Nov 20, 2023
d0e149d
Add test to check for index bias, WIP #108
pratikunterwegs Nov 20, 2023
0ddf133
Add tests for cfr_time_varying and ascertainment correctness
pratikunterwegs Nov 20, 2023
6a4e93b
Simplify cases-PMFs convolution function
pratikunterwegs Nov 20, 2023
e2015a7
Update ascertainment snapshot test file
pratikunterwegs Nov 20, 2023
b07712b
Update WORDLIST
pratikunterwegs Nov 20, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -54,3 +54,5 @@ rsconnect/
# ignore development file
scratch.R
inst/doc
/doc/
/Meta/
21 changes: 12 additions & 9 deletions DESCRIPTION
@@ -1,9 +1,9 @@
Package: cfr
Title: Estimate Disease Severity and Under-Reporting
Title: Estimate Disease Severity and Case Ascertainment
Version: 0.1.0
Authors@R: c(
person(
given = "Pratik", family = "Gupte", role = c("aut", "cre", "cph"), email = "pratik.gupte@lshtm.ac.uk",
given = "Pratik R.", family = "Gupte", role = c("aut", "cre", "cph"), email = "pratik.gupte@lshtm.ac.uk",
comment = c(ORCID = "0000-0001-5294-7819")
),
person(
@@ -16,26 +16,30 @@ Authors@R: c(
),
person(
given = "Joshua W.", family = "Lambert", role = c("rev"), email = "joshua.lambert@lshtm.ac.uk",
comment = c(ORCID = "https://orcid.org/0000-0001-5218-3046")
comment = c(ORCID = "0000-0001-5218-3046")
),
person(
given = "Hugo", family = "Gruson", role = c("rev"), email = "hugo.gruson@data.org",
comment = c(ORCID = "https://orcid.org/0000-0002-4094-1476")
comment = c(ORCID = "0000-0002-4094-1476")
),
person(
given = "Tim", family = "Taylor", role = c("rev"), email = "tim.taylor@hiddenelephants.co.uk",
comment = c(ORCID = "https://orcid.org/0000-0002-8587-7113")
comment = c(ORCID = "0000-0002-8587-7113")
),
person(
given = "James M.", family = "Azam", role = c("rev"), email = "james.azam@lshtm.ac.uk",
comment = c(ORCID = "https://orcid.org/0000-0001-5782-7330")
comment = c(ORCID = "0000-0001-5782-7330")
),
person(
given = "Abdoelnaser M.", family = "Degoot", role = c("rev"), email = "abdoelnaser-mahmood.degoot@lshtm.ac.uk",
comment = c(ORCID = "https://orcid.org/0000-0001-8788-2496")
comment = c(ORCID = "0000-0001-8788-2496")
),
person(
given = "Sebastian", family = "Funk", role = c("rev"), email = "sebastian.funk@lshtm.ac.uk",
comment = c(ORCID = "0000-0002-2842-3406")
)
)
Description: Estimate the severity of a disease and under-reporting of cases, as discussed in Nishiura et al. (2009) <doi:10.1371/journal.pone.0006852>.
Description: Estimate the severity of a disease and ascertainment of cases, as discussed in Nishiura et al. (2009) <doi:10.1371/journal.pone.0006852>.
License: MIT + file LICENSE
URL: https://github.com/epiverse-trace/cfr, https://epiverse-trace.github.io/cfr/
BugReports: https://github.com/epiverse-trace/cfr/issues
@@ -57,7 +61,6 @@ Suggests:
bookdown,
spelling,
incidence2,
outbreaks,
data.table,
distributional,
distcrete
16 changes: 16 additions & 0 deletions NEWS.md
@@ -0,0 +1,16 @@
# cfr 0.1.0

Initial CRAN submission of _cfr_, an R package to estimate the severity of a disease and ascertainment of cases while correcting for delays in outcomes of reported cases being known.

This release includes:

1. Functions for the overall severity of an outbreak, the overall severity of an outbreak estimated with an expanding time series of data, and the time-varying severity of an outbreak,
2. A function to estimate the number of outcomes to be expected from a given number of cases assuming a user-specified distribution of delays between cases and outcomes being known,
3. A function to estimate the overall (static) or time-varying ascertainment of cases in an outbreak by comparing the relevant severity measures against a user-specified baseline severity,
4. A data preparation generic with an S3 method for the `<incidence2>` class from the _incidence2_ package,
5. Example daily case and death data from the 1976 Ebola Virus Disease outbreak as reported in Camacho et al. (2014). <https://doi.org/10.1016/j.epidem.2014.09.003>,
6. Example daily case and death data from the Covid-19 pandemic over the range 2020-01-01 to 2022-12-31 from the 19 countries with over 100,000 deaths over this period, as taken from the _covidregionaldata_ package which is no longer on CRAN,
7. Vignettes describing how to get started with severity estimation, and more detailed workflows on different kinds of severity estimation,
8. A vignette on working with data from the _incidence2_ package, and a vignette on working with delay distributions,
9. 100% code coverage,
10. Workflows to render the vignettes and README as a website.
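The functions listed in these release notes can be strung together into a quick-start workflow. The sketch below is illustrative only: it uses the bundled 1976 Ebola data and the onset-to-death Gamma parameters (Barry et al. 2018) that appear in the package's own documented examples elsewhere in this PR, and the printed results are not reproduced here.

```r
# illustrative sketch; function names, data, and delay parameters are
# taken from the package's documented examples in this release
library(cfr)
data("ebola1976", package = "cfr")

# overall (static) severity, correcting for onset-to-death delays
cfr_static(
  ebola1976,
  delay_density = function(x) dgamma(x, shape = 2.40, scale = 3.33)
)

# the same estimate computed on an expanding time series of the data
cfr_rolling(
  ebola1976,
  delay_density = function(x) dgamma(x, shape = 2.40, scale = 3.33)
)
```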
7 changes: 3 additions & 4 deletions R/cfr_rolling.R
@@ -33,6 +33,7 @@
#'
#' # estimate severity for each day while correcting for delays
#' # obtain onset-to-death delay distribution parameters from Barry et al. 2018
#' # The Lancet. <https://doi.org/10.1016/S0140-6736(18)31387-4>
#' # view only the first values
#' estimate <- cfr_rolling(
#' ebola1976,
@@ -71,7 +72,7 @@ cfr_rolling <- function(data,
# this may need more thought for dates that are integers, POSIXct,
# or other units; consider the units package
)
checkmate::assert_count(poisson_threshold)
checkmate::assert_count(poisson_threshold, positive = TRUE)

# NOTE: delay_density is checked in estimate_outcomes() if passed and not NULL

@@ -121,10 +122,8 @@

cfr_lims <- Map(
cumulative_deaths, cumulative_cases,
f = stats::binom.test, p = 1
f = function(x, n) stats::binom.test(x, n, p = 1)$conf.int
)
# bind list elements together
cfr_lims <- lapply(cfr_lims, `[[`, "conf.int")
cfr_lims <- do.call(rbind, cfr_lims)

# assign to matrix
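The `cfr_rolling()` change above collapses a two-step pattern (`Map()` over `binom.test()`, then a second `lapply()` to pull out `conf.int`) into a single pass by extracting the confidence interval inside the mapped function. A standalone sketch of the same base-R pattern, with invented toy counts rather than package data:

```r
# toy cumulative deaths and cases (invented values for illustration);
# binom.test()$conf.int is the 95% binomial CI for deaths/cases per day
cumulative_deaths <- c(2, 5, 9)
cumulative_cases <- c(10, 40, 90)

# extract conf.int inside the mapped function, avoiding a second lapply()
cfr_lims <- Map(
  cumulative_deaths, cumulative_cases,
  f = function(x, n) stats::binom.test(x, n)$conf.int
)
cfr_lims <- do.call(rbind, cfr_lims) # one row of (lower, upper) per day
```

Note that the confidence interval returned by `binom.test()` does not depend on the null proportion `p`, so passing `p = 1` (as the package code does) leaves these bounds unchanged.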
5 changes: 3 additions & 2 deletions R/cfr_static.R
@@ -29,7 +29,7 @@
#' `function(x) stats::dgamma(x = x, shape = 5, scale = 1)`.
#'
#' @param poisson_threshold The case count above which to use Poisson
#' approximation. Set to 100 by default.
#' approximation. Set to 100 by default. Must be > 0.
#'
#' @details
#' # Details: Adjusting for delays between two time series
@@ -86,6 +86,7 @@
#'
#' # estimate severity for each day while correcting for delays
#' # obtain onset-to-death delay distribution parameters from Barry et al. 2018
#' # The Lancet. <https://doi.org/10.1016/S0140-6736(18)31387-4>
#' cfr_static(
#' ebola1976,
#' delay_density = function(x) dgamma(x, shape = 2.40, scale = 3.33)
@@ -119,7 +120,7 @@ cfr_static <- function(data,
# this may need more thought for dates that are integers, POSIXct,
# or other units; consider the units package
)
checkmate::assert_count(poisson_threshold)
checkmate::assert_count(poisson_threshold, positive = TRUE)

# NOTE: delay_density is checked in estimate_outcomes() if passed and not NULL

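The repeated change from `assert_count(poisson_threshold)` to `assert_count(poisson_threshold, positive = TRUE)` tightens the check from "single non-negative integer-like value" to "strictly positive", matching the commit "Check that Poisson threshold is > 0". In isolation the behaviour looks like this (a sketch; error messages are paraphrased in comments):

```r
# assert_count() accepts one non-missing, integer-like value;
# positive = TRUE additionally rejects zero
checkmate::assert_count(100, positive = TRUE)    # passes, returns invisibly
try(checkmate::assert_count(0, positive = TRUE)) # errors: must be >= 1
try(checkmate::assert_count(2.5))                # errors: not integer-like
```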
66 changes: 27 additions & 39 deletions R/cfr_time_varying.R
@@ -6,7 +6,7 @@
#' (e.g. onset-to-death for the fatality risk).
#'
#' @inheritParams cfr_static
#' @param burn_in A single integer value for the number of time-points
#' @param burn_in A single integer-like value for the number of time-points
#' (typically days) to disregard at the start of the time-series, if a burn-in
#' period is desired.
#'
@@ -54,6 +54,13 @@
#' \eqn{t}, parameterised with disease-specific parameters before it is supplied
#' here.
#'
#' **Note** that the function arguments `burn_in` and `smoothing_window` are not
#' explicitly used in this calculation. `burn_in` controls how many estimates at
#' the beginning of the outbreak are replaced with `NA`s --- the calculation
#' above is not applied to the first `burn_in` data points.
#' The calculation is applied to the smoothed data, if a `smoothing_window`
#' is specified.
#'
#' @references
#' Nishiura, H., Klinkenberg, D., Roberts, M., & Heesterbeek, J. A. P. (2009).
#' Early Epidemiological Assessment of the Virulence of Emerging Infectious
@@ -77,6 +84,7 @@
#'
#' # estimate time varying severity while correcting for delays
#' # obtain onset-to-death delay distribution parameters from Linton et al. 2020
#' # J. Clinical Medicine: <https://doi.org/10.3390/jcm9020538>
#' # view only the first values
#' cfr_time_varying <- cfr_time_varying(
#' data = df_covid_uk,
@@ -90,8 +98,8 @@ cfr_time_varying <- function(data,
burn_in = 7,
smoothing_window = NULL) {
# input checking
# expect an integer-like number
checkmate::assert_int(burn_in, lower = 0)
# zero count allowed to include all data
checkmate::assert_count(burn_in)

# expect rows more than burn in value
checkmate::assert_data_frame(data, min.cols = 3, min.rows = burn_in + 1)
@@ -105,9 +113,13 @@
data[, c("date", "cases", "deaths")],
any.missing = FALSE
)
checkmate::assert_number(smoothing_window, lower = 1, null.ok = TRUE)
# check that data$date is a date column
checkmate::assert_date(data$date, any.missing = FALSE, all.missing = FALSE)
checkmate::assert_count(smoothing_window, null.ok = TRUE)

stopifnot(
"Input data must have sequential dates with none missing or duplicated" =
identical(unique(diff(data$date)), 1), # use numeric 1, not integer
"`smoothing_window` must be an odd number greater than 0" =
(smoothing_window %% 2 != 0),
"`delay_density` must be a function with a single required argument,
@@ -141,14 +153,6 @@
)
}

cases <- df_temp$cases

case_times <- as.numeric(df_temp$date - min(df_temp$date, na.rm = TRUE),
units = "days"
) + 1

case_length <- length(case_times)

##### prepare matrix for severity estimation ####
# when not correcting for delays, set estimated no. of known outcomes to cases
# this is to avoid if-else ladders
@@ -163,23 +167,13 @@
# calculation of indices to modify
# start from the final index to be smoothed
# end at the first row after the burn-in number of rows (days)
# TODO: check if modifying start point is necessary for runmed "keep" endrule
indices <- seq(case_length - smoothing_window, burn_in + 1, -1)
indices <- seq(nrow(data) - smoothing_window, burn_in + 1, -1)
if (!is.null(delay_density)) {
pmf_vals <- delay_density(seq(from = 0, to = nrow(data) - 1L))

df_temp[indices, "estimated_outcomes"] <- vapply(
X = indices,
FUN = function(x) {
delay_pmf_eval <- pmf_vals[case_times[seq_len(x - burn_in)]]
known_onsets_current <- cases[seq_len(x - burn_in)] *
rev(delay_pmf_eval)

# return rounded sum of known_onsets_current
round(sum(known_onsets_current, na.rm = TRUE))
},
FUN.VALUE = numeric(1)
)
df_temp[["estimated_outcomes"]] <- round(.convolve_cases_pmfs(
cases = df_temp$cases, pmf_vals = pmf_vals
))
}

#### Get severity estimates ####
@@ -194,28 +188,22 @@
)

# binomial test at indices
estimates_tmp <- lapply(indices, FUN = function(i) {
severity_current_estimate <- stats::binom.test(
estimates_tmp <- vapply(indices, FUN = function(i) {
severity_estimate <- stats::binom.test(
df_temp$deaths[i],
df_temp$estimated_outcomes[i]
)

# return a vector
c(
severity_current_estimate$estimate[[1]],
severity_current_estimate$conf.int[[1]],
severity_current_estimate$conf.int[[2]]
severity_estimate$estimate[[1]],
severity_estimate$conf.int[[1]],
severity_estimate$conf.int[[2]]
)
})
}, numeric(3))

# create matrix to replace values of severity_estimates at indices
estimates_tmp <- matrix(
unlist(estimates_tmp),
nrow = length(indices),
byrow = TRUE
)
# replace the values at indices
severity_estimates[indices, ] <- estimates_tmp
severity_estimates[indices, ] <- t(estimates_tmp)

# create data frame
severity_estimates <- as.data.frame(severity_estimates)
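The `vapply()` loop removed from `cfr_time_varying()` computed, for each day t, the expected number of known outcomes as the delay-PMF-weighted sum of earlier cases; the diff replaces it with a call to an internal helper, `.convolve_cases_pmfs()`. A hypothetical stand-in for that helper (the name comes from the diff, but the real package implementation may differ) can be sketched as:

```r
# hypothetical sketch of the cases-PMF convolution; the exact behaviour
# of the package-internal function is an assumption
convolve_cases_pmfs <- function(cases, pmf_vals) {
  vapply(
    seq_along(cases),
    # expected outcomes on day t: cases up to t weighted by the
    # probability of an outcome after the corresponding delay
    function(t) sum(cases[seq_len(t)] * rev(pmf_vals[seq_len(t)])),
    FUN.VALUE = numeric(1)
  )
}

cases <- c(1, 3, 5, 7, 9)
pmf_vals <- dgamma(seq(0, 4), shape = 2.40, scale = 3.33)
round(convolve_cases_pmfs(cases, pmf_vals)) # rounded, as in the diff
```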
5 changes: 3 additions & 2 deletions R/covid_data.R
@@ -1,9 +1,10 @@
#' Daily Covid-19 case and death data for countries with 100,000 or more deaths
#'
#' Data adapted from the `covidregionaldata` package of daily cases and
#' Data adapted from the \{covidregionaldata\} package of daily cases and
#' deaths from the 19 countries with 100,000 or more deaths over the period
#' 2020-01-01 to 2022-12-31. See the **References** for the publication which
#' links to data sources made available through `covidregionaldata`.
#' links to data sources made available through \{covidregionaldata\}.
#' Included here because \{covidregionaldata\} is no longer on CRAN.
#' Data are provided as a `<data.frame>`.
#'
#' @format ## `covid_data`
51 changes: 19 additions & 32 deletions R/estimate_ascertainment.R
@@ -17,10 +17,6 @@
#' assumed true baseline severity estimate used to estimate the overall
#' ascertainment ratio. Missing by default, which causes the function to error;
#' must be supplied by the user.
#' @param max_date A `Date` representing a user supplied maximum date, up to
#' which the time-varying severity estimate will be calculated. Useful in the
#' case of long time-series, where the user wishes to focus on a specific
#' time-period. See [as.Date()] for converting a string to a `Date`.
#'
#' @return A `<data.frame>` containing the maximum likelihood estimate
#' and 95% confidence interval of the corrected severity, named
@@ -36,27 +32,27 @@
#' df_covid_uk_subset <- subset(df_covid_uk, date <= "2020-05-31")
#'
#' # use a severity baseline of 1.4% (0.014) taken from Verity et al. (2020)
#' # Lancet Infectious Diseases: 10.1016/S1473-3099(20)30243-7
#' # Lancet Infectious Diseases: <https://doi.org/10.1016/S1473-3099(20)30243-7>
#'
#' # use onset-to-death distribution from Linton et al. (2020)
#' # J. Clinical Medicine: 10.3390/jcm9020538
#' # J. Clinical Medicine: <https://doi.org/10.3390/jcm9020538>
#'
#' # subset data until 30th June 2020
#' data <- df_covid_uk[df_covid_uk$date <= "2020-06-30", ]
#' estimate_ascertainment(
#' data = df_covid_uk,
#' data = data,
#' delay_density = function(x) dlnorm(x, meanlog = 2.577, sdlog = 0.440),
#' type = "varying",
#' severity_baseline = 0.014,
#' burn_in = 7L,
#' max_date = as.Date("2020-06-30")
#' burn_in = 7L
#' )
#'
estimate_ascertainment <- function(data,
severity_baseline,
delay_density = NULL,
type = c("static", "varying"),
severity_baseline,
burn_in = 7,
smoothing_window = NULL,
max_date = NULL) {
smoothing_window = NULL) {
# input checking
# expect rows more than burn in value
checkmate::assert_data_frame(data, min.cols = 3, min.rows = burn_in + 1)
@@ -75,8 +71,8 @@ estimate_ascertainment <- function(data,
severity_baseline,
lower = 0.0, upper = 1.0, finite = TRUE
)
checkmate::assert_int(burn_in, lower = 0)
checkmate::assert_date(max_date, null.ok = TRUE)
# zero count allowed to include all data
checkmate::assert_count(burn_in)

# NOTE: delay_density is checked in estimate_outcomes() if passed and not NULL

@@ -99,46 +95,37 @@

df_sev <- df_sev[!is.na(df_sev$severity_mean), ]

# collect the severity at the last date, or the date specified by
# the user in `max_date`
if (!is.null(max_date)) {
df_sev <- df_sev[df_sev$date == max_date, ]
} else {
df_sev <- df_sev[df_sev$date == max(df_sev$date), ]
}
# collect the severity at the last date in the data
df_sev <- df_sev[df_sev$date == max(df_sev$date), ]

# subset column names
df_sev <- df_sev[
,
grepl("severity", colnames(df_sev), fixed = TRUE)
]
df_sev <- df_sev[, grepl("severity", colnames(df_sev), fixed = TRUE)]
}
)

# data.frame for exports, first scale values by the 1/severity baseline
# then ensure maximum is 1.0
df_severity <- severity_baseline / df_severity
df_ascertainment <- severity_baseline / df_severity
# throw a warning for ascertainment ratios > 1.0
if (any(df_severity > 1.0)) {
if (any(df_ascertainment > 1.0)) {
warning(
"Ascertainment ratios > 1.0 detected, setting these values to 1.0"
)
}
df_severity[df_severity > 1.0] <- 1.0
df_ascertainment[df_ascertainment > 1.0] <- 1.0

# reset row numbers, which might be confusing
rownames(df_severity) <- NULL
rownames(df_ascertainment) <- NULL

# re-convert to data.frame from list
# here, the estimate called "severity_mean" translates to "ascertainment_mean"
# and the estimate "severity_high" translates to "ascertainment_low"
# TODO: check if this is correct
colnames(df_severity) <- c(
colnames(df_ascertainment) <- c(
"ascertainment_mean", "ascertainment_high", "ascertainment_low"
)

# return data with columns in correct order
df_severity[, c(
df_ascertainment[, c(
"ascertainment_mean", "ascertainment_low", "ascertainment_high"
)]
}