Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix midpoint calculation for cuts #19

Merged
merged 7 commits into from
Feb 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,6 @@ repos:
exclude: >
(?x)^(
tests/testthat/_snaps/.*|
.*\.Rd
.*\.Rd|
tests/testthat/issue_18.rds|
)$
68 changes: 33 additions & 35 deletions R/process.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,31 +101,6 @@ process <- function(wsmaf,
)
}

# In some instances, Hmisc::cut2 assigns a cut with only 1 number in it.
# If this happens and we try to group our data, this can mess up our data.
# Therefore, to account for this, we find all instances where this occurs
# and combine these factors with the previous factor.
one_point <- !stringr::str_starts(levels(df$plmaf_cut), "\\[")

# We find all places where we only have one point. But, we ignore the case
# where the one point is the first break (0).
if (sum(one_point) > 1 | (sum(one_point) == 1 & which(one_point)[1] != 1)) {
if (which(one_point)[1] == 1) {
# When 0 is its own break, we ignore it and store all the other locations
points <- which(one_point)[-1]
} else {
# When 0 is not its own break, we store all locations
points <- which(one_point)
}

# We make a list of the factor names of all the points we want to remove. We
# name the list with the previous factor, and combine the two together. This
# effectively puts the points in the single factor into the previous one.
point_list <- c(levels(df$plmaf_cut)[points])
names(point_list) <- levels(df$plmaf_cut)[points - 1]
df$plmaf_cut <- forcats::fct_recode(df$plmaf_cut, !!!point_list)
}

# Average over intervals of PLMAF
df_grouped <- df %>%
dplyr::group_by(.data$plmaf_cut, .drop = FALSE) %>%
Expand All @@ -135,21 +110,44 @@ process <- function(wsmaf,
) %>%
stats::na.omit()

# Find the cuts for our data
cuts <- suppressWarnings(Hmisc::cut2(plmaf, m = bin_size, onlycuts = TRUE))
if (sum(one_point) > 1 | (sum(one_point) == 1 & which(one_point)[1] != 1)) {
cuts <- cuts[-points]
}

# We then find our midpoints
df_grouped$midpoints <- cuts[-length(cuts)] + diff(cuts) / 2
# Compute midpoints
df_grouped_mid <- find_cut_midpoints(df_grouped, .data$plmaf_cut)

# Return data, seq_error, and cuts
list(
data = df_grouped,
data = df_grouped_mid,
seq_error = seq_error,
bin_size = bin_size,
cuts = cuts
cuts = suppressWarnings(Hmisc::cut2(plmaf, m = bin_size, onlycuts = TRUE))
)
}

find_cut_midpoints <- function(data, cuts) {
# Convert single cuts to the standard format: "[lower,upper)"
fix_single_cuts <- dplyr::mutate(
data,
fixed_cuts = as.character({{ cuts }}),
fixed_cuts = ifelse(
!stringr::str_starts(.data$fixed_cuts, "\\["),
glue::glue("[{.data$fixed_cuts},{.data$fixed_cuts})"),
.data$fixed_cuts
)
)

# Find lower and upper bounds
extract_bounds <- tidyr::extract(
data = fix_single_cuts,
col = .data$fixed_cuts,
into = c("lower", "upper"),
regex = "([[:alnum:]].+),([[:alnum:]].+)[\\]\\)]",
convert = TRUE
)

# Determine cut midpoints
dplyr::mutate(
extract_bounds,
midpoints = (.data$upper + .data$lower) / 2,
.keep = "unused"
)
}

Expand Down
17 changes: 17 additions & 0 deletions tests/testthat/_snaps/process.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# cut midpoints work for a large data set

Code
compute_coi(data, "real", seq_error = data$seq_error, bin_size = data$bin_size,
coi_method = data$coi_method)
Output
$coi
[1] 1

$probability
[1] 0.7589392288 0.0970897028 0.0434247877 0.0246155376 0.0159032233
[6] 0.0111576164 0.0082839487 0.0064095253 0.0051176318 0.0041885658
[11] 0.0034974474 0.0029689706 0.0025555018 0.0022256993 0.0019582463
[16] 0.0017382291 0.0015549565 0.0014005993 0.0012693162 0.0011566764
[21] 0.0010592684 0.0009744307 0.0009000617 0.0008344838 0.0007763443


Binary file added tests/testthat/issue_18.rds
Binary file not shown.
48 changes: 48 additions & 0 deletions tests/testthat/test-process.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
computed_midpoints <- function(data, cuts) {
find_cut_midpoints(data, cuts) %>%
dplyr::distinct() %>%
dplyr::pull(midpoints) %>%
sort()
}

test_that("can find cut midpoints", {
set.seed(100)
plmaf <- runif(100, 0, 0.5)
data <- data.frame(cuts = Hmisc::cut2(plmaf, m = 25))

expect_equal(
computed_midpoints(data, cuts),
c(0.08455, 0.20625, 0.31635, 0.43475)
)
})

test_that("can find cut midpoints even if there is a cut with one value", {
data <- data.frame(cuts = as.factor(c(
"[0.1861,0.311)", "[0.0219,0.186)", "[0.3110,0.355]", "0.0000", "0.4750"
)))

expect_equal(
computed_midpoints(data, cuts),
c(0.0000, 0.10395, 0.24855, 0.333, 0.4750),
tolerance = 0.001
)
})

test_that("cuts can end with a `(` or a `[`", {
data <- data.frame(cuts = as.factor(c("[0.1861,0.311)", "[0.0219,0.186]")))
expect_equal(computed_midpoints(data, cuts), c(0.10395, 0.24855))
})

test_that("cut midpoints work for a large data set (#18)", {
# The data set we use here used to fail when running our algorithms
data <- readRDS("issue_18.rds")
expect_snapshot(
compute_coi(
data,
"real",
seq_error = data$seq_error,
bin_size = data$bin_size,
coi_method = data$coi_method
)
)
})