Merge pull request #153 from xtrojak/clean_up_filenames

Use proper sample IDs inside feature tables
RECETOX · Oct 25, 2022 · 5ac72f5 · 5ac72f5
2 parents 6029737 + 8edd88d
commit 5ac72f5
Show file tree

Hide file tree

Showing 14 changed files with 149 additions and 130 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
 - refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
 - refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
+- use proper sample IDs inside feature tables [#153](https://github.com/RECETOX/recetox-aplcms/pull/153)
 ### Removed
 
 ## [0.9.4] - 2022-05-10

diff --git a/R/adjust.time.R b/R/adjust.time.R
@@ -29,12 +29,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
 
   # now the first column is the template retention time.
   # the second column is the to-be-adjusted retention time
-
-  cat(c("sample", j, "using", nrow(all_features), ", "))
-  if (j %% 3 == 0) {
-    cat("\n")
-  }
-
+
   all_features <- all_features[order(all_features[, 2]), ]
   return(all_features)
 }
@@ -77,10 +72,11 @@ fill_missing_values <- function(orig.feature, this.feature) {
 
 compute_template <- function(extracted_features) {
   num.ftrs <- sapply(extracted_features, nrow)
-  template <- which.max(num.ftrs)
+  template_id <- which.max(num.ftrs)
+  template <- extracted_features[[template_id]]$sample_id[1]
   message(paste("the template is sample", template))
 
-  candi <- tibble::as_tibble(extracted_features[[template]]) |> dplyr::select(c(mz, rt))
+  candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt))
   template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
   return(tibble::as_tibble(template_features))
 }

diff --git a/R/compute_clusters.R b/R/compute_clusters.R
@@ -12,6 +12,7 @@
 #' @param mz_max_diff float Maximum difference between featuure mz values to belong to the same cluster.
 #' @param rt_tol_relative float Relative retention time tolerance to use for grouping features.
 #' @param do.plot bool Plot graphics or not.
+#' @param sample_names list List of sample names.
 #' @return Returns a list with following items:
 #' \itemize{
 #'   \item feature_tables - list - Feature tables with added columns [sample_id, cluster].
@@ -23,10 +24,11 @@ compute_clusters <- function(feature_tables,
                              mz_tol_absolute,
                              mz_max_diff,
                              rt_tol_relative,
-                             do.plot = FALSE) {
+                             do.plot = FALSE,
+                             sample_names = NA) {
   number_of_samples <- length(feature_tables)
-  all <- concatenate_feature_tables(feature_tables)
-
+  all <- concatenate_feature_tables(feature_tables, sample_names)
+  
   if (is.na(mz_tol_relative)) {
     mz_tol_relative <- find.tol(
       all$mz,

diff --git a/R/feature.align.R b/R/feature.align.R
@@ -16,13 +16,14 @@ add_row <- function(df, data, i, column_names) {
 }
 
 
-create_output <- function(sample_grouped, number_of_samples) {
+create_output <- function(sample_grouped, sample_names) {
+    number_of_samples <- length(sample_names)
     intensity_row <- rep(0, number_of_samples)
     rt_row <- rep(0, number_of_samples)
     sample_presence <- rep(0, number_of_samples)
 
     for (i in seq_along(intensity_row)) {
-        filtered <- filter(sample_grouped, sample_id == i)
+        filtered <- filter(sample_grouped, sample_id == sample_names[i])
         if (nrow(filtered) != 0) {
             sample_presence[i] <- 1
             intensity_row[i] <- sum(filtered$area)
@@ -68,23 +69,23 @@ filter_based_on_density <- function(sample, turns, index, i) {
 }
 
 
-select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
+select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
     turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
     for (i in seq_along(turns$peaks)) {
         sample_grouped <- filter_based_on_density(sample, turns, 2, i)
         if (validate_contents(sample_grouped, min_occurrence)) {
-            return(create_output(sample_grouped, number_of_samples))
+            return(create_output(sample_grouped, sample_names))
         }
     }
 }
 
 
-select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
+select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
     turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
     for (i in seq_along(turns$peaks)) {
         sample_grouped <- filter_based_on_density(sample, turns, 1, i)
         if (validate_contents(sample_grouped, min_occurrence)) {
-            return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
+            return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
         }
     }
 }
@@ -96,32 +97,33 @@ create_rows <- function(features,
                         mz_tol_relative,
                         rt_tol_relative,
                         min_occurrence,
-                        number_of_samples) {
+                        sample_names) {
     if (i %% 100 == 0) {
         gc()
     } # call Garbage Collection for performance improvement?
-
+    
     sample <- dplyr::filter(features, cluster == sel.labels[i])
     if (nrow(sample) > 1) {
         if (validate_contents(sample, min_occurrence)) {
-            return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
+            return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
         }
     } else if (min_occurrence == 1) {
-        return(create_output(sample_grouped, number_of_samples))
+        return(create_output(sample_grouped, sample_names))
     }
     return(NULL)
 }
 
 
 create_aligned_feature_table <- function(all_table,
                                          min_occurrence,
-                                         number_of_samples,
+                                         sample_names,
                                          rt_tol_relative,
                                          mz_tol_relative) {
 
-    metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", paste0("sample_", 1:number_of_samples))
-    intensity_colnames <- c("id", paste0("sample_", 1:number_of_samples, "_intensity"))
-    rt_colnames <- c("id", paste0("sample_", 1:number_of_samples, "_rt"))
+    number_of_samples <- length(sample_names)
+    metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
+    intensity_colnames <- c("id", paste0(sample_names, "_intensity"))
+    rt_colnames <- c("id", paste0(sample_names, "_rt"))
 
     aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)
 
@@ -139,7 +141,7 @@ create_aligned_feature_table <- function(all_table,
             mz_tol_relative,
             rt_tol_relative,
             min_occurrence,
-            number_of_samples
+            sample_names
         )
 
         if (!is.null(rows)) {
@@ -168,6 +170,7 @@ create_aligned_feature_table <- function(all_table,
 #'  when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly
 #'  influences feature matching in higher m/z range.
 #' @param do.plot Indicates whether plot should be drawn.
+#' @param sample_names list List of sample names.
 #' @return Returns a list object with the following objects in it:
 #' \itemize{
 #'   \item aligned.ftrs - A matrix, with columns of m/z values, elution times, signal strengths in each spectrum.
@@ -178,37 +181,39 @@ create_aligned_feature_table <- function(all_table,
 #' @export
 #' @examples
 #' data(extracted)
-#' feature.align(extracted, mz_max_diff = 10 * 1e-05, do.plot = FALSE)
+#' feature.align(extracted, mz_max_diff = 10 * 1e-05, do.plot = FALSE, sample_names = c("s1", "s2", "s3"))
 feature.align <- function(features,
                           min_occurrence = 2,
                           mz_tol_relative = NA,
                           rt_tol_relative = NA,
                           mz_max_diff = 1e-4,
                           mz_tol_absolute = 0.01,
-                          do.plot = TRUE) {
+                          do.plot = TRUE,
+                          sample_names = NA) {
     if (do.plot) {
         par(mfrow = c(3, 2))
         draw_plot(label = "Feature alignment", cex = 2)
         draw_plot()
     }
-
-
+
     number_of_samples <- length(features)
     if (number_of_samples > 1) {
         res <- compute_clusters(
             features,
             mz_tol_relative,
             mz_tol_absolute,
             mz_max_diff,
-            rt_tol_relative
+            rt_tol_relative,
+            do.plot,
+            sample_names
         )
 
         all_table <- dplyr::bind_rows(res$feature_tables)
 
         aligned_features <- create_aligned_feature_table(
             all_table,
             min_occurrence,
-            number_of_samples,
+            sample_names,
             res$rt_tol_relative,
             res$mz_tol_relative
         )

diff --git a/R/hybrid.R b/R/hybrid.R
@@ -290,7 +290,8 @@ hybrid <- function(
     mz_tol_relative = align_mz_tol,
     mz_tol_absolute = max_align_mz_diff,
     mz_max_diff = 10 * mz_tol,
-    rt_tol_relative = align_rt_tol
+    rt_tol_relative = align_rt_tol,
+    sample_names = sample_names
   )
 
   message("**** computing template ****")
@@ -318,7 +319,7 @@ hybrid <- function(
   aligned <- create_aligned_feature_table(
       dplyr::bind_rows(adjusted_clusters$feature_tables),
       min_exp,
-      number_of_samples,
+      sample_names,
       adjusted_clusters$rt_tol_relative,
       adjusted_clusters$mz_tol_relative
   )
@@ -392,7 +393,7 @@ hybrid <- function(
   recovered_aligned <- create_aligned_feature_table(
       dplyr::bind_rows(adjusted_clusters$feature_tables),
       min_exp,
-      number_of_samples,
+      sample_names,
       adjusted_clusters$rt_tol_relative,
       adjusted_clusters$mz_tol_relative
   )

diff --git a/R/recover.weaker.R b/R/recover.weaker.R
@@ -669,7 +669,6 @@ recover.weaker <- function(filename,
                            bandwidth = .5,
                            recover.min.count = 3,
                            intensity.weighted = FALSE) {
-
   # load raw data
   data_table <- load_file(filename) |> dplyr::arrange_at("mz")
   times <- sort(unique(data_table$rt))
@@ -778,7 +777,7 @@ recover.weaker <- function(filename,
           mz = this.rec$mz[this.sel],
           rt = this.rec$rt[this.sel] + this.time.adjust,
           area = this.rec$intensities[this.sel],
-          sample_id = grep(sample_name, colnames(metadata_table)) - 8 # offset for other columns `mz`, `rt` etc
+          sample_id = sample_name
         )
       }
     }

diff --git a/R/semi.sup.R b/R/semi.sup.R
@@ -105,7 +105,8 @@ semi.sup <- function(
     match.tol.ppm=NA,
     new.feature.min.count=2,
     recover.min.count=3,
-    intensity.weighted=FALSE)
+    intensity.weighted=FALSE,
+    sample_names = NA)
 {
     setwd(folder)
     files<-files[order(files)]
@@ -243,7 +244,8 @@ semi.sup <- function(
               mz_tol_relative = align.mz.tol,
               rt_tol_relative = align.rt.tol,
               mz_max_diff = 10 * mz.tol,
-              mz_tol_absolute = max.align.mz.diff
+              mz_tol_absolute = max.align.mz.diff,
+              sample_names = sample_names
             )
         )
 
@@ -456,7 +458,8 @@ semi.sup <- function(
               mz_tol_relative = align.mz.tol,
               rt_tol_relative = align.rt.tol,
               mz_max_diff = 10 * mz.tol,
-              mz_tol_absolute = max.align.mz.diff
+              mz_tol_absolute = max.align.mz.diff,
+              sample_names = sample_names
             )
         )
 

diff --git a/R/two.step.hybrid.R b/R/two.step.hybrid.R
@@ -363,6 +363,8 @@ two.step.hybrid <- function(filenames,
   batches_idx <- unique(metadata$batch)
   batchwise <- new("list")
   message("* processing ", length(batches_idx), " batches separately")
+
+  sample_names <- get_sample_name(filenames)
 
   for (batch.i in batches_idx) {
     files_batch <- dplyr::filter(filenames_batchwise, batch == batch.i)$filename
@@ -391,7 +393,8 @@ two.step.hybrid <- function(filenames,
       use.observed.range = use.observed.range,
       shape.model = shape.model,
       new.feature.min.count = new.feature.min.count,
-      recover.min.count = recover.min.count
+      recover.min.count = recover.min.count,
+      sample_names = sample_names
     )
 
     features$final.ftrs <- features$final.ftrs[order(features$final.ftrs[, 1], features$final.ftrs[, 2]), ]

diff --git a/R/unsupervised.R b/R/unsupervised.R
@@ -58,6 +58,7 @@ sort_samples_by_acquisition_number <- function (filenames) {
 }
 
 align_features <- function(sample_names, ...) {
+  # if this will be used in Galaxy wrapper, needs to be fixed (pass also sample_names)
   aligned <- feature.align(...)
 
   list(
@@ -242,7 +243,8 @@ unsupervised <- function(
     mz_tol_relative = align_mz_tol,
     mz_tol_absolute = max_align_mz_diff,
     mz_max_diff = 10 * mz_tol,
-    rt_tol_relative = align_rt_tol
+    rt_tol_relative = align_rt_tol,
+    sample_names = sample_names
   )
 
   message("**** computing template ****")
@@ -270,7 +272,7 @@ unsupervised <- function(
   aligned <- create_aligned_feature_table(
       dplyr::bind_rows(adjusted_clusters$feature_tables),
       min_exp,
-      number_of_samples,
+      sample_names,
       adjusted_clusters$rt_tol_relative,
       adjusted_clusters$mz_tol_relative
   )
@@ -314,7 +316,7 @@ unsupervised <- function(
   recovered_aligned <- create_aligned_feature_table(
       dplyr::bind_rows(recovered_clusters$feature_tables),
       min_exp,
-      number_of_samples,
+      sample_names,
       recovered_clusters$rt_tol_relative,
       recovered_clusters$mz_tol_relative
   )