Merge pull request #164 from xtrojak/reorganise-util-functions

Cleanup utility functions
RECETOX · Nov 28, 2022 · 9e9fc64 · 9e9fc64
2 parents 5e1e986 + 6306301
commit 9e9fc64
Show file tree

Hide file tree

Showing 13 changed files with 147 additions and 392 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,9 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - use proper sample IDs inside feature tables [#153](https://github.com/RECETOX/recetox-aplcms/pull/153)
 - exported functions in NAMESPACE [#154](https://github.com/RECETOX/recetox-aplcms/pull/154)
 - docstrings and documentation files for refactored functions [#160](https://github.com/RECETOX/recetox-aplcms/pull/160)
+- moved some utility functions to a more suitable locations [#164](https://github.com/RECETOX/recetox-aplcms/pull/160)
 ### Removed
 - `extract_features` and `feature.align` [#154](https://github.com/RECETOX/recetox-aplcms/pull/154)
 - improper usage of `@examples` [#160](https://github.com/RECETOX/recetox-aplcms/pull/160)
+- several obsolete utility functions [#164](https://github.com/RECETOX/recetox-aplcms/pull/160)
 
 ## [0.9.4] - 2022-05-10
 

diff --git a/NAMESPACE b/NAMESPACE
@@ -35,7 +35,6 @@ export(create_aligned_feature_table)
 export(draw_rt_correction_plot)
 export(draw_rt_normal_peaks)
 export(duplicate.row.remove)
-export(extract_pattern_colnames)
 export(find.tol)
 export(find.turn.point)
 export(find_local_maxima)
@@ -53,15 +52,13 @@ export(load.lcms)
 export(load_aligned_features)
 export(load_data)
 export(load_file)
-export(make.known.table)
 export(merge_features_and_known_table)
 export(msExtrema)
 export(normix)
 export(normix.bic)
 export(plot_normix_bic)
 export(plot_peak_summary)
 export(plot_raw_profile_histogram)
-export(plot_rt_histograms)
 export(plot_rt_profile)
 export(predict_mz_break_indices)
 export(predict_smoothed_rt)
@@ -75,7 +72,6 @@ export(rev_cum_sum)
 export(rm.ridge)
 export(run_filter)
 export(semi.sup)
-export(sort_samples_by_acquisition_number)
 export(span)
 export(two.step.hybrid)
 export(unsupervised)

diff --git a/R/hybrid.R b/R/hybrid.R
@@ -2,6 +2,30 @@
 NULL
 #> NULL
 
+#' Compute matches between mz array and specific mass value with a tolerance.
+#' @param sample_mz The mz array for which to compute the matching.
+#' @param known_mz The mz value with which to match.
+#' @param match_tol_ppm Matching tolerance in ppm.
+#' @return Indicies of m/z values within the tolerance of any known m/z.
+#' @export
+#' @examples
+#' find_mz_match(
+#'  sample_mz = c(10, 20, 21),
+#'  known_mz = 20
+#' )
+find_mz_match <- function(sample_mz, known_mz, match_tol_ppm = 5) {
+  matched_mz_idx <- rep(0, length(sample_mz))
+  match_tol_ppm <- match_tol_ppm / 1e6
+
+  for (i in seq_along(sample_mz)) {
+    rel_diff <- abs((sample_mz[i] - known_mz) / sample_mz[i])
+    if (min(rel_diff) < match_tol_ppm) {
+      matched_mz_idx[i] <- 1
+    }
+  }
+  return(which(matched_mz_idx == 1))
+}
+
 #' Match peaks from sample table to already known peaks via similar m/z and rt.
 #' @param aligned A list object with three tibble tables: metadata, intensity, and rt.
 #' @param known_table A table of known/previously detected peaks.

diff --git a/R/make.known.table.R b/R/make.known.table.R
diff --git a/R/mass.match.R b/R/mass.match.R
diff --git a/R/plot.R b/R/plot.R
@@ -145,16 +145,3 @@ draw_rt_correction_plot <- function(colors,
     )
   }
 }
-
-#' @export
-plot_rt_histograms <- function(pk.times,
-                               mz_sd) {
-    hist(mz_sd,
-        xlab = "m/z SD", ylab = "Frequency",
-        main = "m/z SD distribution"
-    )
-    hist(apply(pk.times[, -1:-4], 1, sd, na.rm = TRUE),
-        xlab = "Retention time SD", ylab = "Frequency",
-        main = "Retention time SD distribution"
-    )
-}
diff --git a/R/two.step.hybrid.R b/R/two.step.hybrid.R
@@ -2,6 +2,123 @@
 NULL
 #> NULL
 
+#' @importFrom dplyr select inner_join
+as_feature_crosstab <- function(sample_names, metadata, data) {
+  metadata_cols <- c('id', 'mz', 'rt', 'mzmin', 'mzmax')
+  data <- select(metadata, metadata_cols) |>
+    inner_join(data, on='id')
+  colnames(data) <- c(metadata_cols, sample_names)
+
+  return(data)
+}
+
+recover_weaker_signals <- function(
+  cluster,
+  filenames,
+  extracted_features,
+  corrected_features,
+  aligned_rt_crosstab,
+  aligned_int_crosstab,
+  original_mz_tolerance,
+  aligned_mz_tolerance,
+  aligned_rt_tolerance,
+  recover_mz_range,
+  recover_rt_range,
+  use_observed_range,
+  min_bandwidth,
+  max_bandwidth,
+  recover_min_count
+) {
+  snow::clusterExport(cluster, c('recover.weaker'))
+  snow::clusterEvalQ(cluster, library("splines"))
+
+  recovered <- lapply(seq_along(filenames), function(i) {
+    recover.weaker(
+      sample_name = get_sample_name(filenames[i]),
+      filename = filenames[[i]],
+      extracted_features = as_tibble(extracted_features[[i]]),
+      adjusted_features = as_tibble(corrected_features[[i]]),
+      pk.times = aligned_rt_crosstab,
+      aligned.ftrs = aligned_int_crosstab,
+      orig.tol = original_mz_tolerance,
+      align.mz.tol = aligned_mz_tolerance,
+      align.rt.tol = aligned_rt_tolerance,
+      recover_mz_range = recover_mz_range,
+      recover_rt_range = recover_rt_range,
+      use.observed.range = use_observed_range,
+      bandwidth = 0.5,
+      min.bw = min_bandwidth,
+      max.bw = max_bandwidth,
+      recover.min.count = recover_min_count
+    )
+  })
+
+  feature_table <- aligned_rt_crosstab[, 1:4]
+  rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times))
+  int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs))
+
+  feature_names <- rownames(feature_table)
+  sample_names <- colnames(aligned_rt_crosstab[, -(1:4)])
+
+  list(
+    extracted_features = lapply(recovered, function(x) x$this.f1),
+    corrected_features = lapply(recovered, function(x) x$this.f2),
+    rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab),
+    int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab)
+  )
+}
+
+pivot_feature_values <- function(feature_table, variable) {
+  extended_variable <- paste0("sample_", variable)
+  values <- dplyr::select(feature_table, mz, rt, sample, !!sym(extended_variable))
+  values <- tidyr::pivot_wider(values, names_from = sample, values_from = !!sym(extended_variable))
+  variable_colnames <- colnames(values)[3:ncol(values)]
+  variable_colnames <- paste0(variable_colnames, "_", variable)
+  colnames(values)[3:ncol(values)] <- variable_colnames
+  return(values)
+}
+
+long_to_wide_feature_table <- function(feature_table) {
+  sample_rts <- pivot_feature_values(feature_table, "rt")
+  sample_intensities <- pivot_feature_values(feature_table, "intensity")
+  feature_table <- dplyr::select(feature_table, mz, rt) %>%
+    dplyr::distinct(mz, rt) %>%
+    dplyr::inner_join(sample_rts, by = c("mz", "rt")) %>%
+    dplyr::inner_join(sample_intensities, by = c("mz", "rt"))
+}
+
+wide_to_long_feature_table <- function(wide_table, sample_names) {
+  wide_table <- tibble::rowid_to_column(wide_table, "feature")
+
+  long_rt <- tidyr::gather(wide_table, sample, sample_rt, contains("_rt"), factor_key=FALSE) %>%
+    dplyr::select(-contains("_intensity")) %>%
+    mutate(sample = stringr::str_remove_all(sample, "_rt"))
+  long_int <- tidyr::gather(wide_table, sample, sample_intensity, contains("_intensity"), factor_key=FALSE) %>%
+    dplyr::select(-contains("_rt")) %>%
+    mutate(sample = stringr::str_remove_all(sample, "_intensity"))
+
+  long_features <- dplyr::full_join(long_rt, long_int, by = c("feature", "mz", "rt", "mz_min", "mz_max", "sample"))
+
+  return(long_features)
+}
+
+extract_pattern_colnames <- function(dataframe, pattern) {
+  dataframe <- dplyr::select(dataframe, contains(pattern))
+  return(colnames(dataframe))
+}
+
+as_wide_aligned_table <- function(aligned) {
+  mz_scale_table <- aligned$rt_crosstab[, c("mz", "rt", "mz_min", "mz_max")]
+  aligned <- as_feature_sample_table(
+    rt_crosstab = aligned$rt_crosstab,
+    int_crosstab = aligned$int_crosstab
+  )
+  aligned <- long_to_wide_feature_table(aligned)
+  aligned <- dplyr::inner_join(aligned, mz_scale_table, by = c("mz", "rt"))
+  return(aligned)
+}
+
+
 merge_known_tables <- function(batchwise, batches_idx) {
   colnames <- c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", "m.z",
               "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", 
@@ -256,6 +373,8 @@ semisup_to_hybrid_adapter <- function(batchwise, batches_idx) {
 #' Two step hybrid feature detection.
 #' 
 #' A two-stage hybrid feature detection and alignment procedure, for data generated in multiple batches.
+#' NOTE: This function is OBSOLETE and should no longer be used,
+#' since it is no longer maintained and will soon be removed.
 #' 
 #' @param filenames file names
 #' @param metadata the batch label of each file.