From 10e8f9f1a543fa8b977137a5ffd4eaa08ea217d3 Mon Sep 17 00:00:00 2001 From: Zargham Ahmad Date: Fri, 28 Jun 2024 15:47:58 +0000 Subject: [PATCH 1/5] added grouping thresold --- R/remove_noise.R | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/R/remove_noise.R b/R/remove_noise.R index 07ca42d..d0e96a2 100644 --- a/R/remove_noise.R +++ b/R/remove_noise.R @@ -70,7 +70,8 @@ remove_noise <- function(filename, baseline_correct_noise_percentile, intensity_weighted, do.plot, - cache) { + cache, + grouping_threshold = 0) { raw.data <- load_file(filename) raw.prof <- adaptive.bin( @@ -87,10 +88,28 @@ remove_noise <- function(filename, raw.prof$features$intensities, raw.prof$features$grps ) - + run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_pres & raw.prof$height.rec[, 3] > baseline_correct), 1] newprof <- newprof[newprof[, 4] %in% run.sel, ] + + if (grouping_threshold > 0) { + sorted_newprof <- newprof[order(newprof[,2]),] + new_grps <- cumsum(c(0, diff(sorted_newprof[,2])) > grouping_threshold) + sorted_newprof <- cbind(sorted_newprof, new_grps, deparse.level = 0) + + sorted_newprof_df <- tibble::as_tibble(sorted_newprof) + + newprof <- as.matrix(sorted_newprof_df |> + dplyr::group_by(V4, V5) |> + dplyr::mutate(cluster = cur_group_id()) |> + dplyr::ungroup() |> + dplyr::arrange(cluster) |> + dplyr::select(-V4, -V5) + ) + colnames(newprof) <- NULL + } + new.prof <- run_filter( newprof, min_pres = min_pres, From debf57c05c593a7925ea14fd83430e61c0754e6f Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 9 Jul 2024 09:26:11 +0200 Subject: [PATCH 2/5] grouping_threshold added to the remove_noise function --- R/remove_noise.R | 5 +++-- conda/environment-dev.yaml | 2 +- tests/remote-files/input.txt | 3 ++- tests/testthat/test-remove_noise.R | 30 ++++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/R/remove_noise.R b/R/remove_noise.R index d0e96a2..e4ebc74 100644 --- a/R/remove_noise.R +++ b/R/remove_noise.R @@ -60,6 +60,7 @@ load_data <- function(filename, #' @param intensity_weighted Whether to use intensity to weight mass density estimation. #' @param do.plot Indicates whether plot should be drawn. #' @param cache Whether to use cache +#' @param grouping_threshold The maximum difference between two scans to be considered the same EIC. Default is Inf. #' @return A matrix with four columns: m/z value, retention time, intensity, and group number. #' @export remove_noise <- function(filename, @@ -71,7 +72,7 @@ remove_noise <- function(filename, intensity_weighted, do.plot, cache, - grouping_threshold = 0) { + grouping_threshold = Inf) { raw.data <- load_file(filename) raw.prof <- adaptive.bin( @@ -93,7 +94,7 @@ remove_noise <- function(filename, newprof <- newprof[newprof[, 4] %in% run.sel, ] - if (grouping_threshold > 0) { + if (grouping_threshold < Inf) { sorted_newprof <- newprof[order(newprof[,2]),] new_grps <- cumsum(c(0, diff(sorted_newprof[,2])) > grouping_threshold) sorted_newprof <- cbind(sorted_newprof, new_grps, deparse.level = 0) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index ba6f375..521e95e 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -8,7 +8,7 @@ dependencies: - icu <=70.1 - r-mass - r-rgl - - bioconductor-mzR ==2.28.0 + - bioconductor-mzR ==2.36.0 - r-splines2 - r-doparallel - r-foreach diff --git a/tests/remote-files/input.txt b/tests/remote-files/input.txt index 4afb4bf..80f98a8 100644 --- a/tests/remote-files/input.txt +++ b/tests/remote-files/input.txt @@ -6,4 +6,5 @@ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/RCX_08_shortened.mzML https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/single_eic.mzml https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/alg3.mzdata -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/test_file.mzXML \ No newline at end of file +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/test_file.mzXML +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/Tribrid_201106_009-QC1_1_NEG_FISABIO_single_eic.raw.mzML \ No newline at end of file diff --git a/tests/testthat/test-remove_noise.R b/tests/testthat/test-remove_noise.R index 2691fec..7464fbc 100644 --- a/tests/testthat/test-remove_noise.R +++ b/tests/testthat/test-remove_noise.R @@ -74,3 +74,33 @@ patrick::with_parameters_test_that( ) ) ) + +test_that("remove noise works with grouping threshold", { + testdata <- file.path("..", "testdata") + input_path <- file.path(testdata, + "input", + "Tribrid_201106_009-QC1_1_NEG_FISABIO_single_eic.raw.mzML") + + expected <- tibble(group_number = c(1, 2, 3, 5, 6, 7, 8, 9), + n = c(67, 73, 3, 39, 2, 6, 3, 7)) + + sut <- remove_noise( + input_path, + min_pres = 0.8, + min_run = 0.2, + mz_tol = 5e-05, + baseline_correct = 0.0, + baseline_correct_noise_percentile = 0.05, + intensity_weighted = FALSE, + do.plot = FALSE, + cache = FALSE, + grouping_threshold = 4 + ) + + actual <- sut %>% + mutate(group = factor(group_number)) %>% + group_by(group_number) %>% + summarize(n = n()) + + expect_equal(actual, expected) +}) \ No newline at end of file From 53aac82fca3cf4f290380cf67a5452257f193b1a Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Tue, 9 Jul 2024 07:40:21 +0000 Subject: [PATCH 3/5] Update environment-dev.yaml --- conda/environment-dev.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index 521e95e..c125e8c 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -14,7 +14,7 @@ dependencies: - r-foreach - r-snow - r-rcpp - - r-arrow >=7.0.0,<10.0.0 + - r-arrow - r-dplyr - r-tidyr - r-stringr From 5b9cf8c20219ad714c9daf916dac68eb09ea3609 Mon Sep 17 00:00:00 2001 From: Helge Hecht Date: Tue, 9 Jul 2024 12:37:41 +0000 Subject: [PATCH 4/5] Update environment-dev.yaml --- conda/environment-dev.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index c125e8c..c903498 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - r-base - - icu <=70.1 + - icu - r-mass - r-rgl - bioconductor-mzR ==2.36.0 From 24fe08e5fe66f5ec8ab06db2fe4057cbae0fcc5c Mon Sep 17 00:00:00 2001 From: hechth Date: Tue, 9 Jul 2024 15:34:59 +0200 Subject: [PATCH 5/5] added skipping test for mzdata and higher mzR versions. --- tests/testthat/test-load.lcms.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testthat/test-load.lcms.R b/tests/testthat/test-load.lcms.R index dd55f6e..d9c56cc 100644 --- a/tests/testthat/test-load.lcms.R +++ b/tests/testthat/test-load.lcms.R @@ -10,6 +10,10 @@ create_test_case <- function(filename, mz_length, rt_length, intensities_length) patrick::with_parameters_test_that( "test load.lcms reads different file types", { + if(packageVersion("mzR") >= "2.29.0" && tools::file_ext(filename) == "mzdata") { + print("mzR >= 2.29.0 no longer supports mzdata.") + skip() + } # Arrange: Set up test inputs testdata <- file.path("..", "testdata") input_path <- file.path(testdata, "input", filename)