bailey-lab · arisp99 · Feb 24, 2022 · Feb 17, 2022 · Feb 17, 2022 · Feb 17, 2022
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -80,5 +80,6 @@ repos:
 exclude: >
   (?x)^(
   tests/testthat/_snaps/.*|
-  .*\.Rd
+  .*\.Rd|
+  tests/testthat/issue_18.rds|
   )$
diff --git a/R/process.R b/R/process.R
@@ -101,31 +101,6 @@ process <- function(wsmaf,
     )
   }
 
-  # In some instances, Hmisc::cut2 assigns a cut with only 1 number in it.
-  # If this happens and we try to group our data, this can mess up our data.
-  # Therefore, to account for this, we find all instances where this occurs
-  # and combine these factors with the previous factor.
-  one_point <- !stringr::str_starts(levels(df$plmaf_cut), "\\[")
-
-  # We find all places where we only have one point. But, we ignore the case
-  # where the one point is the first break (0).
-  if (sum(one_point) > 1 | (sum(one_point) == 1 & which(one_point)[1] != 1)) {
-    if (which(one_point)[1] == 1) {
-      # When 0 is its own break, we ignore it and store all the other locations
-      points <- which(one_point)[-1]
-    } else {
-      # When 0 is not its own break, we store all locations
-      points <- which(one_point)
-    }
-
-    # We make a list of the factor names of all the points we want to remove. We
-    # name the list with the previous factor, and combine the two together. This
-    # effectively puts the points in the single factor into the previous one.
-    point_list <- c(levels(df$plmaf_cut)[points])
-    names(point_list) <- levels(df$plmaf_cut)[points - 1]
-    df$plmaf_cut <- forcats::fct_recode(df$plmaf_cut, !!!point_list)
-  }
-
   # Average over intervals of PLMAF
   df_grouped <- df %>%
     dplyr::group_by(.data$plmaf_cut, .drop = FALSE) %>%
@@ -135,21 +110,44 @@ process <- function(wsmaf,
     ) %>%
     stats::na.omit()
 
-  # Find the cuts for our data
-  cuts <- suppressWarnings(Hmisc::cut2(plmaf, m = bin_size, onlycuts = TRUE))
-  if (sum(one_point) > 1 | (sum(one_point) == 1 & which(one_point)[1] != 1)) {
-    cuts <- cuts[-points]
-  }
-
-  # We then find our midpoints
-  df_grouped$midpoints <- cuts[-length(cuts)] + diff(cuts) / 2
+  # Compute midpoints
+  df_grouped_mid <- find_cut_midpoints(df_grouped, .data$plmaf_cut)
 
   # Return data, seq_error, and cuts
   list(
-    data = df_grouped,
+    data = df_grouped_mid,
     seq_error = seq_error,
     bin_size = bin_size,
-    cuts = cuts
+    cuts = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size, onlycuts = TRUE))
+  )
+}
+
+find_cut_midpoints <- function(data, cuts) {
+  # Convert single cuts to the standard format: "[lower,upper)"
+  fix_single_cuts <- dplyr::mutate(
+    data,
+    fixed_cuts = as.character({{ cuts }}),
+    fixed_cuts = ifelse(
+      !stringr::str_starts(.data$fixed_cuts, "\\["),
+      glue::glue("[{.data$fixed_cuts},{.data$fixed_cuts})"),
+      .data$fixed_cuts
+    )
+  )
+
+  # Find lower and upper bounds
+  extract_bounds <- tidyr::extract(
+    data = fix_single_cuts,
+    col = .data$fixed_cuts,
+    into = c("lower", "upper"),
+    regex = "([[:alnum:]].+),([[:alnum:]].+)[\\]\\)]",
+    convert = TRUE
+  )
+
+  # Determine cut midpoints
+  dplyr::mutate(
+    extract_bounds,
+    midpoints = (.data$upper + .data$lower) / 2,
+    .keep = "unused"
   )
 }
 

diff --git a/tests/testthat/_snaps/process.md b/tests/testthat/_snaps/process.md
@@ -0,0 +1,17 @@
+# cut midpoints work for a large data set
+
+    Code
+      compute_coi(data, "real", seq_error = data$seq_error, bin_size = data$bin_size,
+      coi_method = data$coi_method)
+    Output
+      $coi
+      [1] 1
+
+      $probability
+       [1] 0.7589392288 0.0970897028 0.0434247877 0.0246155376 0.0159032233
+       [6] 0.0111576164 0.0082839487 0.0064095253 0.0051176318 0.0041885658
+      [11] 0.0034974474 0.0029689706 0.0025555018 0.0022256993 0.0019582463
+      [16] 0.0017382291 0.0015549565 0.0014005993 0.0012693162 0.0011566764
+      [21] 0.0010592684 0.0009744307 0.0009000617 0.0008344838 0.0007763443
+
+
diff --git a/tests/testthat/issue_18.rds b/tests/testthat/issue_18.rds
diff --git a/tests/testthat/test-process.R b/tests/testthat/test-process.R
@@ -0,0 +1,48 @@
+computed_midpoints <- function(data, cuts) {
+  find_cut_midpoints(data, cuts) %>%
+    dplyr::distinct() %>%
+    dplyr::pull(midpoints) %>%
+    sort()
+}
+
+test_that("can find cut midpoints", {
+  set.seed(100)
+  plmaf <- runif(100, 0, 0.5)
+  data <- data.frame(cuts = Hmisc::cut2(plmaf, m = 25))
+
+  expect_equal(
+    computed_midpoints(data, cuts),
+    c(0.08455, 0.20625, 0.31635, 0.43475)
+  )
+})
+
+test_that("can find cut midpoints even if there is a cut with one value", {
+  data <- data.frame(cuts = as.factor(c(
+    "[0.1861,0.311)", "[0.0219,0.186)", "[0.3110,0.355]", "0.0000", "0.4750"
+  )))
+
+  expect_equal(
+    computed_midpoints(data, cuts),
+    c(0.0000, 0.10395, 0.24855, 0.333, 0.4750),
+    tolerance = 0.001
+  )
+})
+
+test_that("cuts can end with a `(` or a `[`", {
+  data <- data.frame(cuts = as.factor(c("[0.1861,0.311)", "[0.0219,0.186]")))
+  expect_equal(computed_midpoints(data, cuts), c(0.10395, 0.24855))
+})
+
+test_that("cut midpoints work for a large data set (#18)", {
+  # The data set we use here used to fail when running our algorithms
+  data <- readRDS("issue_18.rds")
+  expect_snapshot(
+    compute_coi(
+      data,
+      "real",
+      seq_error = data$seq_error,
+      bin_size = data$bin_size,
+      coi_method = data$coi_method
+    )
+  )
+})