Merge pull request #177 from zargham-ahmad/issue_55

Full test data test for unsupervised and hybrid
RECETOX · Jan 26, 2023 · 3cff037 · 3cff037
2 parents aa11247 + 8158221
commit 3cff037
Show file tree

Hide file tree

Showing 15 changed files with 86 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,9 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - refactored `rm.ridge.R` [#171](https://github.com/RECETOX/recetox-aplcms/pull/171)
 - refactored and documented `prof.to.features.R` [#170](https://github.com/RECETOX/recetox-aplcms/pull/170)
+- added full testdata case for `unsupervised.R` and `hybrid.R` [#177](https://github.com/RECETOX/recetox-aplcms/pull/177)
+- added function to sort data in `compute_clusters.R` to return sorted data [#177](https://github.com/RECETOX/recetox-aplcms/pull/177)
 
 ### Changed
+- updated remote files with the full data get links [#177](https://github.com/RECETOX/recetox-aplcms/pull/177)
+- fixed parameter value of recover.weaker in `unsupervised.R` and `hybrid.R` [#177](https://github.com/RECETOX/recetox-aplcms/pull/177)
+
 ### Removed
+removed NA check in `concatenate_feature_tables` [#177](https://github.com/RECETOX/recetox-aplcms/pull/177)
 
 ## [0.10.0] - 2022-12-07
 

diff --git a/NAMESPACE b/NAMESPACE
@@ -72,6 +72,7 @@ export(rev_cum_sum)
 export(rm.ridge)
 export(run_filter)
 export(semi.sup)
+export(sort_data)
 export(span)
 export(two.step.hybrid)
 export(unsupervised)

diff --git a/R/compute_clusters.R b/R/compute_clusters.R
@@ -1,4 +1,20 @@
 
+#' @description
+#' Sort tibble based on sample_names
+#' @export
+sort_data <- function(sample_names, feature_tables){
+  index <- c()
+  for (i in seq_along(sample_names))
+  {
+    index <-  append(index, feature_tables[[i]]$sample_id[1])
+  }
+
+  index <- match(sample_names, index)
+  feature_tables <- feature_tables[index]
+
+  return(feature_tables)
+}
+
 #' Compute clusters of mz and rt and assign cluster id to individual features.
 #' 
 #' @description
@@ -83,6 +99,8 @@ compute_clusters <- function(feature_tables,
     dplyr::group_by(sample_id) |>
     dplyr::arrange_at(c("mz", "rt")) |>
     dplyr::group_split()
-
+
+  feature_tables <- sort_data(sample_names, feature_tables)
+
   return(list(feature_tables = feature_tables, rt_tol_relative = rt_tol_relative, mz_tol_relative = mz_tol_relative))
 }
diff --git a/R/hybrid.R b/R/hybrid.R
@@ -405,7 +405,8 @@ hybrid <- function(
     mz_tol_absolute = extracted_clusters$rt_tol_relative,
     mz_max_diff = 10 * mz_tol,
     rt_tol_relative = rt_tol_relative,
-    do.plot = do_plot
+    do.plot = do_plot,
+    sample_names = sample_names
   )
 
   message("**** feature alignment ****")
@@ -426,12 +427,12 @@ hybrid <- function(
       rt_tol_relative = adjusted_clusters$rt_tol_relative,
       from_features_to_known_table = FALSE
   )
-
+  
   message("**** weaker signal recovery ****")
   recovered <- lapply(seq_along(filenames), function(i) {
     recover.weaker(
       filename = filenames[[i]],
-      sample_name = as.character(i),
+      sample_name = sample_names[i],
       extracted_features = extracted[[i]],
       adjusted_features = corrected[[i]],
       metadata_table = merged$metadata,
@@ -460,9 +461,10 @@ hybrid <- function(
     mz_tol_absolute = mz_tol_absolute,
     mz_max_diff = 10 * mz_tol,
     rt_tol_relative = rt_tol_relative,
-    do.plot = do_plot
+    do.plot = do_plot,
+    sample_names = sample_names
   )
-
+  
   message("**** computing template ****")
   template_features <- compute_template(recovered_clusters$feature_tables)
 
@@ -482,9 +484,10 @@ hybrid <- function(
     mz_tol_absolute = recovered_clusters$rt_tol_relative,
     mz_max_diff = 10 * mz_tol,
     rt_tol_relative = rt_tol_relative,
-    do.plot = do_plot
+    do.plot = do_plot,
+    sample_names = sample_names
   )
-
+  
   message("**** second feature alignment ****")
   recovered_aligned <- create_aligned_feature_table(
       dplyr::bind_rows(adjusted_clusters$feature_tables),

diff --git a/R/recover.weaker.R b/R/recover.weaker.R
@@ -148,6 +148,7 @@ compute_target_times <- function(aligned_rts,
 
     aligned_rts[sel_non_na] <- predict(sp, aligned_rts[sel_non_na])$y
   }
+  return(aligned_rts)
 }
 
 #' Get boolean mask for values that occur only once.

diff --git a/R/unsupervised.R b/R/unsupervised.R
@@ -195,7 +195,8 @@ unsupervised <- function(
     mz_tol_absolute = extracted_clusters$rt_tol_relative,
     mz_max_diff = 10 * mz_tol,
     rt_tol_relative = rt_tol_relative,
-    do.plot = do_plot
+    do.plot = do_plot,
+    sample_names = sample_names
   )
 
   message("**** feature alignment ****")
@@ -211,7 +212,7 @@ unsupervised <- function(
   recovered <- lapply(seq_along(filenames), function(i) {
     recover.weaker(
       filename = filenames[[i]],
-      sample_name = as.character(i),
+      sample_name = sample_names[i],
       extracted_features = feature_tables[[i]],
       adjusted_features = corrected[[i]],
       metadata_table = aligned$metadata,
@@ -240,7 +241,8 @@ unsupervised <- function(
     mz_tol_absolute = adjusted_clusters$rt_tol_relative,
     mz_max_diff = 10 * mz_tol,
     rt_tol_relative = rt_tol_relative,
-    do.plot = do_plot
+    do.plot = do_plot,
+    sample_names = sample_names
   )
 
   message("**** feature alignment ****")

diff --git a/R/utils.R b/R/utils.R
@@ -53,11 +53,9 @@ register_functions_to_cluster <- function(cluster) {
 #' 
 #' @param features list List of tibbles containing extracted feature tables.
 concatenate_feature_tables <- function(features, sample_names) {
-    if(!all(is.na(sample_names))) {
-        for (i in seq_along(features)) {
-            if(!("sample_id" %in% colnames(features[[i]]))) {
-                features[[i]] <- tibble::add_column(features[[i]], sample_id = sample_names[i])
-            }
+    for (i in seq_along(features)) {
+        if(!("sample_id" %in% colnames(features[[i]]))) {
+            features[[i]] <- tibble::add_column(features[[i]], sample_id = sample_names[i])
         }
     }
 

diff --git a/tests/remote-files/hybrid.txt b/tests/remote-files/hybrid.txt
@@ -1,3 +1,4 @@
 https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/hybrid/RCX_shortened_recovered_feature_sample_table.parquet
 https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/hybrid/known_table.parquet
-https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/hybrid/mbr_recovered_feature_sample_table.parquet
+https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/hybrid/mbr_recovered_feature_sample_table.parquet
+https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/hybrid/qc_no_dil_milliq_recovered_feature_sample_table.parquet
diff --git a/tests/remote-files/unsupervised.txt b/tests/remote-files/unsupervised.txt
@@ -1,2 +1,3 @@
 https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/unsupervised/RCX_shortened_unsupervised.parquet
-https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/unsupervised/mbr_test_unsupervised.parquet
+https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/unsupervised/mbr_test_unsupervised.parquet
+https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/unsupervised/qc_no_dil_milliq_unsupervised.parquet
diff --git a/tests/remote-files/extracted-whole-input.txt → tests/remote-files/whole-data-input.txt b/tests/remote-files/extracted-whole-input.txt → tests/remote-files/whole-data-input.txt
diff --git a/tests/testthat/test-benchmark-extract_features.R b/tests/testthat/test-benchmark-extract_features.R
@@ -1,7 +1,7 @@
 patrick::with_parameters_test_that(
   "test benchmark",
   {
-    if (skip) {
+    if (skip_benchmark) {
       skip("Disabled")
     }
 
@@ -97,7 +97,7 @@ patrick::with_parameters_test_that(
       intensity_weighted = FALSE,
       sd_cut = c(0.01, 500),
       sigma_ratio_lim = c(0.01, 100),
-      skip = FALSE
+      skip_benchmark = FALSE
     )
   )
 )
diff --git a/tests/testthat/test-benchmark-unsupervised.R b/tests/testthat/test-benchmark-unsupervised.R
@@ -1,7 +1,7 @@
 patrick::with_parameters_test_that(
   "test benchmark",
   {
-    if (skip) {
+    if (skip_benchmark) {
       skip("Disabled")
     }
 
@@ -37,7 +37,7 @@ patrick::with_parameters_test_that(
   patrick::cases(
     mbr_test = list(
       filename = c("mbr_test0", "mbr_test1", "mbr_test2"),
-      skip = TRUE
+      skip_benchmark = TRUE
     )
   )
 )
diff --git a/tests/testthat/test-extract_features.R b/tests/testthat/test-extract_features.R
@@ -2,7 +2,7 @@ patrick::with_parameters_test_that(
   "extract single feature works",
   {
     skip_on_ci()
-    if (skip) {
+    if (full_testdata) {
       skip("skipping whole data test case")
     }
 
@@ -86,7 +86,7 @@ patrick::with_parameters_test_that(
       intensity_weighted = FALSE,
       sd_cut = c(0.01, 500),
       sigma_ratio_lim = c(0.01, 100),
-      skip = FALSE
+      full_testdata = FALSE
     ),
     qc_no_dil_milliq = list(
       files = c("8_qc_no_dil_milliq.mzml", "21_qc_no_dil_milliq.mzml", "29_qc_no_dil_milliq.mzml"),
@@ -97,7 +97,7 @@ patrick::with_parameters_test_that(
       intensity_weighted = FALSE,
       sd_cut = c(0.01, 500),
       sigma_ratio_lim = c(0.01, 100),
-      skip = TRUE
+      full_testdata = TRUE
     )
   )
 )
diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R
@@ -1,6 +1,10 @@
 patrick::with_parameters_test_that("basic hybrid test", {
   if(ci_skip == TRUE) skip_on_ci()
 
+  if (full_testdata) {
+    skip("skipping whole data test case")
+  }
+
   store_reports <- FALSE
   testdata <- file.path("..", "testdata")
 
@@ -50,10 +54,17 @@ patrick::with_parameters_test_that("basic hybrid test", {
 patrick::cases(
   mbr = list(
     files = c("mbr_test0.mzml", "mbr_test1.mzml", "mbr_test2.mzml"),
-    ci_skip = TRUE
+    ci_skip = TRUE,
+    full_testdata = FALSE
   ),
   RCX_shortened = list(
     files = c("RCX_06_shortened.mzML", "RCX_07_shortened.mzML", "RCX_08_shortened.mzML"),
-    ci_skip = FALSE
+    ci_skip = FALSE,
+    full_testdata = FALSE
+  ),
+  qc_no_dil_milliq = list(
+    files = c("8_qc_no_dil_milliq.mzml", "21_qc_no_dil_milliq.mzml", "29_qc_no_dil_milliq.mzml"),
+    ci_skip = TRUE,
+    full_testdata = TRUE
   )
 ))
diff --git a/tests/testthat/test-unsupervised.R b/tests/testthat/test-unsupervised.R
@@ -2,6 +2,11 @@ patrick::with_parameters_test_that(
   "basic unsupervised test",
   {
     store_reports <- FALSE
+
+    if (full_testdata) {
+      skip("skipping whole data test case")
+    }
+
     test_files <- sapply(files, function(x) file.path("../testdata/input", x))
 
     expected <- arrow::read_parquet(file.path("../testdata/unsupervised", paste0(.test_name, "_unsupervised.parquet")))
@@ -30,7 +35,17 @@ patrick::with_parameters_test_that(
     expect_equal(actual, expected, tolerance = 0.01)
   },
   patrick::cases(
-    mbr_test = list(files = c("mbr_test0.mzml", "mbr_test1.mzml", "mbr_test2.mzml")),
-    RCX_shortened = list(files = c("RCX_06_shortened.mzML", "RCX_07_shortened.mzML", "RCX_08_shortened.mzML"))
+    mbr_test = list(
+      files = c("mbr_test0.mzml", "mbr_test1.mzml", "mbr_test2.mzml"),
+      full_testdata = FALSE
+    ),
+    RCX_shortened = list(
+      files = c("RCX_06_shortened.mzML", "RCX_07_shortened.mzML", "RCX_08_shortened.mzML"),
+      full_testdata = FALSE
+    ),
+    qc_no_dil_milliq = list(
+      files = c("8_qc_no_dil_milliq.mzml", "21_qc_no_dil_milliq.mzml", "29_qc_no_dil_milliq.mzml"),
+      full_testdata = TRUE
+    )
   )
 )