Skip to content

Commit

Permalink
Merge pull request #201 from hechth/198_clustering
Browse files Browse the repository at this point in the history
Implemented new version for clustering
  • Loading branch information
hechth authored May 3, 2023
2 parents 0543377 + 9bda7dc commit c81061b
Show file tree
Hide file tree
Showing 61 changed files with 1,252 additions and 160 deletions.
58 changes: 28 additions & 30 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,40 @@
// https://github.com/microsoft/vscode-dev-containers/tree/v0.217.1/containers/docker-existing-dockerfile
{
"name": "recetox-aplcms-dev",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile",

"image": "ubuntu:20.04",
"features": {
"git": {
"ghcr.io/devcontainers/features/git:1": {
"version": "latest",
"ppa": false,
"ppa": false
},
"common": {
"installZsh": "false",
"username": "false",
}
"ghcr.io/devcontainers/features/common-utils:2": {
"installZsh": "false"
},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/devcontainers/features/conda:1": {}
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"reditorsupport.r",
"rdebugger.r-debugger",
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"meakbiyik.vscode-r-test-adapter",
"dvirtz.parquet-viewer",
"github.vscode-pull-request-github",
"ms-vsliveshare.vsliveshare",
"tianyishi.rmarkdown"
],
"settings": {
"r.rterm.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/radian",
"r.rpath.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/R"
"customizations": {
"vscode": {
"extensions": [
"reditorsupport.r",
"rdebugger.r-debugger",
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"meakbiyik.vscode-r-test-adapter",
"dvirtz.parquet-viewer",
"github.vscode-pull-request-github",
"ms-vsliveshare.vsliveshare",
"tianyishi.rmarkdown"
],
"settings": {
"r.rterm.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/radian",
"r.rpath.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/R"
}
}
},

"onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && git config --global --add safe.directory /workspaces/recetox-aplcms",

"onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && apt-get update -y && apt-get install -y libxml2-dev && apt-get install -y libssl-dev && apt-get install -y libcurl4-openssl-dev && apt-get install -y libcgal-dev && apt-get install -y libglu1-mesa-dev && apt-get install -y wget && git config --global --add safe.directory /workspaces/recetox-aplcms && conda init && conda update -y conda && conda config --add channels conda-forge && conda config --add channels bioconda && conda config --set channel_priority strict && conda env create --file conda/environment-dev.yaml",
"postAttachCommand": "/bin/bash"
}
}
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [dev] - unreleased
### Added
- implemented new clustering algorithm and included parallelism in unsupervised and hybrid [#201](https://github.com/RECETOX/recetox-aplcms/pull/201)
### Changed
- refactored adaptive.bin and combine.seq.3 [#196](https://github.com/RECETOX/recetox-aplcms/pull/196)
- refactored find.match [#193](https://github.com/RECETOX/recetox-aplcms/pull/193)
Expand Down
23 changes: 19 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Generated by roxygen2: do not edit by hand

S3method(solve,a)
S3method(solve,sigma)
export(adaptive.bin)
export(adjust.time)
export(aggregate_by_rt)
Expand All @@ -15,6 +13,10 @@ export(compute_breaks)
export(compute_breaks_3)
export(compute_chromatographic_profile)
export(compute_clusters)
export(compute_clusters_simple)
export(compute_comb)
export(compute_corrected_features)
export(compute_curr_rec_with_enough_peaks)
export(compute_delta_rt)
export(compute_densities)
export(compute_dx)
Expand All @@ -26,33 +28,43 @@ export(compute_mass_density)
export(compute_mass_values)
export(compute_mu_sc_std)
export(compute_mz_sd)
export(compute_peaks_and_valleys)
export(compute_pks_vlys_rt)
export(compute_rectangle)
export(compute_rt_intervals_indices)
export(compute_scale)
export(compute_sel)
export(compute_start_bound)
export(compute_target_times)
export(compute_template)
export(compute_template_adjusted_rt)
export(compute_uniq_grp)
export(correct_time)
export(count_peaks)
export(create_aligned_feature_table)
export(create_output)
export(create_rows)
export(draw_rt_correction_plot)
export(draw_rt_normal_peaks)
export(duplicate.row.remove)
export(fill_missing_values)
export(filter_based_on_density)
export(find.tol)
export(find.turn.point)
export(find_local_maxima)
export(find_mz_match)
export(find_mz_tolerance)
export(find_optima)
export(get_custom_rt_tol)
export(get_features_in_rt_range)
export(get_mzrange_bound_indices)
export(get_num_workers)
export(get_rt_region_indices)
export(get_single_occurrence_mask)
export(get_times_to_use)
export(hybrid)
export(increment_counter)
export(interpol.area)
export(l2normalize)
export(label_val_to_keep)
export(load.lcms)
export(load_aligned_features)
Expand All @@ -68,18 +80,21 @@ export(plot_raw_profile_histogram)
export(plot_rt_profile)
export(predict_mz_break_indices)
export(predict_smoothed_rt)
export(prep.uv)
export(prep_uv)
export(preprocess_bandwidth)
export(preprocess_profile)
export(prof.to.features)
export(recover.weaker)
export(refine_selection)
export(remove_noise)
export(rev_cum_sum)
export(rm.ridge)
export(run_filter)
export(select_mz)
export(select_rt)
export(semi.sup)
export(solve_a)
export(solve_sigma)
export(sort_data)
export(span)
export(two.step.hybrid)
Expand Down
5 changes: 5 additions & 0 deletions R/adjust.time.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
NULL
#> NULL

#' @export
compute_comb <- function(template_features, features) {
combined <- dplyr::bind_rows(
template_features,
Expand All @@ -11,6 +12,7 @@ compute_comb <- function(template_features, features) {
return(combined)
}

#' @export
compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
l <- nrow(combined)
sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] <
Expand All @@ -20,6 +22,7 @@ compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
return(sel)
}

#' @export
compute_template_adjusted_rt <- function(combined, sel, j) {
all_features <- cbind(combined$rt[sel], combined$rt[sel + 1])
flip_indices <- which(combined$sample_id[sel] == j)
Expand All @@ -34,6 +37,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
return(all_features)
}

#' @export
compute_corrected_features <- function(features, delta_rt, avg_time) {
features <- features[order(features$rt, features$mz), ]
corrected <- features$rt
Expand All @@ -58,6 +62,7 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
return(features)
}

#' @export
fill_missing_values <- function(orig.feature, this.feature) {
missing_values <- which(is.na(this.feature$rt))
for (i in missing_values) {
Expand Down
115 changes: 91 additions & 24 deletions R/compute_clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
#' Sort feature tables based on sample names
#'
#' @description
#' Reorders the list of feature tables so that their order matches
#' `sample_names`. Each table is keyed by the `sample_id` of its first row.
#' @param sample_names character vector Sample names defining the target order.
#' @param feature_tables list of tibbles Feature tables, each carrying a
#'   `sample_id` column with a single sample's identifier.
#' @return list of tibbles The feature tables reordered to match `sample_names`.
#' @export
sort_data <- function(sample_names, feature_tables) {
  # Key each table by the sample_id of its first row.
  # lapply + unlist avoids growing a vector with append() inside a loop
  # and preserves whatever type sample_id has.
  index <- unlist(lapply(feature_tables, function(table) table$sample_id[1]))

  # Reorder the tables so position i holds the table for sample_names[i].
  feature_tables[match(sample_names, index)]
}

#' Compute clusters of mz and rt and assign cluster id to individual features.
#'
#'
#' @description
#' Uses tolerances to group features with mz and rt within the tolerance into clusters,
#' creating larger features from raw data points. Custom tolerances for mz and rt are
Expand All @@ -34,7 +34,7 @@ sort_data <- function(sample_names, feature_tables){
#' \item feature_tables - list - Feature tables with added columns [sample_id, cluster].
#' \item rt_tol_relative - float - Newly determined relative rt tolerance.
#' \item mz_tol_relative - float - Newly determined relative mz tolerance.
#'}
#' }
#' @export
compute_clusters <- function(feature_tables,
mz_tol_relative,
Expand All @@ -45,9 +45,9 @@ compute_clusters <- function(feature_tables,
sample_names = NA) {
number_of_samples <- length(feature_tables)
all <- concatenate_feature_tables(feature_tables, sample_names)

if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(
mz_tol_relative <- find_mz_tolerance(
all$mz,
mz_max_diff = mz_max_diff,
aver.bin.size = 4000,
Expand All @@ -74,37 +74,104 @@ compute_clusters <- function(feature_tables,
)
}

res <- find.tol.time(
all,
number_of_samples = number_of_samples,
mz_tol_relative = mz_tol_relative,
rt_tol_relative = rt_tol_relative,
aver.bin.size = 200,
min.bins = 50,
max.bins = 100,
mz_tol_absolute = mz_tol_absolute,
max.num.segments = 10000,
do.plot = do.plot
aver.bin.size <- 200
min.bins <- 50
max.bins <- 100
max.num.segments <- 10000

features <- dplyr::arrange_at(all, "mz")
min_mz_tol <- compute_min_mz_tolerance(
features$mz,
mz_tol_relative,
mz_tol_absolute
)

rt_tol_relative <- res$rt.tol
mz_breaks <- compute_breaks_3(features$mz, min_mz_tol)
features$mz_group <- 0

for (i in 2:length(mz_breaks)) {
subset_indices <- (mz_breaks[i - 1] + 1):mz_breaks[i]
features$mz_group[subset_indices] <- i
}

features <- features |> dplyr::arrange_at(c("mz_group", "rt"))

mz_breaks <- mz_breaks[c(-1, -length(mz_breaks))]

if (is.na(rt_tol_relative)) {
rt_tol_relative <- compute_rt_tol_relative(
mz_breaks,
max.num.segments,
aver.bin.size,
number_of_samples,
features$rt,
min.bins,
max.bins
)
}

# compute breaks in rt domain
rt_diffs <- diff(features$rt)
rt_breaks <- which(rt_diffs > rt_tol_relative)

# combine indices of all breaks in array and sort
all.breaks <- c(0, unique(c(mz_breaks, rt_breaks)), nrow(features))
all.breaks <- all.breaks[order(all.breaks)]

features$cluster <- 0
for (i in 2:length(all.breaks)) {
features$cluster[(all.breaks[i - 1] + 1):all.breaks[i]] <- i
}

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
message(paste("time tolerance level:", rt_tol_relative))

# Select features from individual samples, sort by mz and rt and
# Select features from individual samples, sort by mz and rt and
# return the sorted tables as individual tibbles.
feature_tables <- res$features |>
feature_tables <- features |>
dplyr::select(-mz_group) |>
dplyr::group_by(sample_id) |>
dplyr::arrange_at(c("mz", "rt")) |>
dplyr::group_split()

feature_tables <- sort_data(sample_names, feature_tables)

return(list(feature_tables = feature_tables, rt_tol_relative = rt_tol_relative, mz_tol_relative = mz_tol_relative))
}

#' Compute clusters using simple grouping based on numeric thresholds.
#'
#' @description
#' Features are first grouped in mz dimension based on the tolerance.
#' First, the absolute tolerance is computed for each feature, then a new group is started
#' once the difference between consecutive features is above this threshold.
#' The same process is then repeated for the retention time dimension.
#' The individual indices are then combined into a single index in the `cluster` column.
#' @param feature_tables list of tibbles feature tables coming from all samples.
#' @param sample_names list of strings Sample names of the feature tables used to distinguish the samples.
#' @param mz_tol_ppm float Relative tolerance for mz grouping in parts per million.
#' @param rt_tol float Tolerance in retention time dimension [seconds].
#' @return list of tibbles Feature tables passed initially with additional columns indicating the
#' mz and rt groups as well as the combined cluster index.
#' @export
compute_clusters_simple <- function(feature_tables, sample_names, mz_tol_ppm, rt_tol) {
  all <- concatenate_feature_tables(feature_tables, sample_names) |> dplyr::arrange_at("mz")

  # Convert ppm to a relative fraction, then to a per-feature absolute mz tolerance.
  mz_tol_rel <- mz_tol_ppm * 1e-06
  mz_tol_abs <- all$mz * mz_tol_rel

  all |>
    # Start a new mz group whenever the gap to the previous feature exceeds
    # that feature's absolute tolerance (first feature always opens group 0).
    dplyr::mutate(mz_group = cumsum(c(0, diff(mz)) > mz_tol_abs)) |>
    dplyr::group_by(mz_group) |>
    dplyr::arrange_at("rt") |>
    # Within each mz group, start a new rt group on gaps larger than rt_tol.
    dplyr::mutate(rt_group = cumsum(c(0, diff(rt)) > rt_tol)) |>
    dplyr::group_by(mz_group, rt_group) |>
    # Collapse the (mz_group, rt_group) pair into a single cluster index.
    # Explicit dplyr:: prefix keeps the call resolvable in package context,
    # consistent with every other dplyr call in this function.
    dplyr::mutate(cluster = dplyr::cur_group_id()) |>
    dplyr::ungroup() |>
    dplyr::arrange_at("cluster") |>
    dplyr::group_by(sample_id) |>
    dplyr::group_split()
}


# compute_clusters_v2 <- function(feature_tables, mz_tol_ppm, rt_tol) {
Expand Down
8 changes: 5 additions & 3 deletions R/feature.align.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,13 @@ create_aligned_feature_table <- function(features_table,
if (!is(cluster, "cluster")) {
cluster <- parallel::makeCluster(cluster)
on.exit(parallel::stopCluster(cluster))

# NOTE: side effect (doParallel has no functionality to clean up)
doParallel::registerDoParallel(cluster)
register_functions_to_cluster(cluster)
}

# NOTE: side effect (doParallel has no functionality to clean up)
doParallel::registerDoParallel(cluster)
register_functions_to_cluster(cluster)


number_of_samples <- length(sample_names)
metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
Expand Down
Loading

0 comments on commit c81061b

Please sign in to comment.