diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e25854db..3f0a920c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,42 +2,40 @@ // https://github.com/microsoft/vscode-dev-containers/tree/v0.217.1/containers/docker-existing-dockerfile { "name": "recetox-aplcms-dev", - // Sets the run context to one level up instead of the .devcontainer folder. - "context": "..", - // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - "dockerFile": "../Dockerfile", - + "image": "ubuntu:20.04", "features": { - "git": { + "ghcr.io/devcontainers/features/git:1": { "version": "latest", - "ppa": false, + "ppa": false }, - "common": { - "installZsh": "false", - "username": "false", - } + "ghcr.io/devcontainers/features/common-utils:2": { + "installZsh": "false" + }, + "ghcr.io/devcontainers/features/github-cli:1": {}, + "ghcr.io/devcontainers/features/conda:1": {} }, - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ - "reditorsupport.r", - "rdebugger.r-debugger", - "eamodio.gitlens", - "mutantdino.resourcemonitor", - "meakbiyik.vscode-r-test-adapter", - "dvirtz.parquet-viewer", - "github.vscode-pull-request-github", - "ms-vsliveshare.vsliveshare", - "tianyishi.rmarkdown" - ], - "settings": { - "r.rterm.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/radian", - "r.rpath.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/R" + "customizations": { + "vscode": { + "extensions": [ + "reditorsupport.r", + "rdebugger.r-debugger", + "eamodio.gitlens", + "mutantdino.resourcemonitor", + "meakbiyik.vscode-r-test-adapter", + "dvirtz.parquet-viewer", + "github.vscode-pull-request-github", + "ms-vsliveshare.vsliveshare", + "tianyishi.rmarkdown" + ], + "settings": { + "r.rterm.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/radian", + "r.rpath.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/R" + } + } }, - - "onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && git config --global --add safe.directory /workspaces/recetox-aplcms", - + "onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && apt-get update -y && apt-get install -y libxml2-dev && apt-get install -y libssl-dev && apt-get install -y libcurl4-openssl-dev && apt-get install -y libcgal-dev && apt-get install -y libglu1-mesa-dev && apt-get install -y wget && git config --global --add safe.directory /workspaces/recetox-aplcms && conda init && conda update -y conda && conda config --add channels conda-forge && conda config --add channels bioconda && conda config --set channel_priority strict && conda env create --file conda/environment-dev.yaml", "postAttachCommand": "/bin/bash" -} +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 14c4ce1b..972d4eee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [dev] - unreleased ### Added + - implemented new clustrering algorithm and included parallelism in unsupervised and hybrid [#201](https://github.com/RECETOX/recetox-aplcms/pull/201) ### Changed - refactored adaptive.bin and combine.seq.3 [#196](https://github.com/RECETOX/recetox-aplcms/pull/196) - refactored find.match [#193](https://github.com/RECETOX/recetox-aplcms/pull/193) diff --git a/NAMESPACE b/NAMESPACE index eb1bd7d3..1061d6aa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,5 @@ # Generated by roxygen2: do not edit by hand -S3method(solve,a) -S3method(solve,sigma) export(adaptive.bin) export(adjust.time) export(aggregate_by_rt) @@ -15,6 +13,10 @@ export(compute_breaks) export(compute_breaks_3) export(compute_chromatographic_profile) export(compute_clusters) +export(compute_clusters_simple) +export(compute_comb) +export(compute_corrected_features) +export(compute_curr_rec_with_enough_peaks) export(compute_delta_rt) export(compute_densities) export(compute_dx) @@ -26,33 +28,43 @@ export(compute_mass_density) export(compute_mass_values) export(compute_mu_sc_std) export(compute_mz_sd) +export(compute_peaks_and_valleys) +export(compute_pks_vlys_rt) +export(compute_rectangle) export(compute_rt_intervals_indices) export(compute_scale) +export(compute_sel) export(compute_start_bound) export(compute_target_times) export(compute_template) +export(compute_template_adjusted_rt) export(compute_uniq_grp) export(correct_time) +export(count_peaks) export(create_aligned_feature_table) export(create_output) export(create_rows) export(draw_rt_correction_plot) export(draw_rt_normal_peaks) export(duplicate.row.remove) +export(fill_missing_values) export(filter_based_on_density) -export(find.tol) export(find.turn.point) export(find_local_maxima) export(find_mz_match) +export(find_mz_tolerance) export(find_optima) export(get_custom_rt_tol) +export(get_features_in_rt_range) export(get_mzrange_bound_indices) export(get_num_workers) export(get_rt_region_indices) +export(get_single_occurrence_mask) export(get_times_to_use) export(hybrid) export(increment_counter) export(interpol.area) +export(l2normalize) export(label_val_to_keep) export(load.lcms) export(load_aligned_features) @@ -68,11 +80,12 @@ export(plot_raw_profile_histogram) export(plot_rt_profile) export(predict_mz_break_indices) export(predict_smoothed_rt) -export(prep.uv) +export(prep_uv) export(preprocess_bandwidth) export(preprocess_profile) export(prof.to.features) export(recover.weaker) +export(refine_selection) export(remove_noise) export(rev_cum_sum) export(rm.ridge) @@ -80,6 +93,8 @@ export(run_filter) export(select_mz) export(select_rt) export(semi.sup) +export(solve_a) +export(solve_sigma) export(sort_data) export(span) export(two.step.hybrid) diff --git a/R/adjust.time.R b/R/adjust.time.R index 86293231..47e4c215 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -2,6 +2,7 @@ NULL #> NULL +#' @export compute_comb <- function(template_features, features) { combined <- dplyr::bind_rows( template_features, @@ -11,6 +12,7 @@ compute_comb <- function(template_features, features) { return(combined) } +#' @export compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) { l <- nrow(combined) sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] < @@ -20,6 +22,7 @@ compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) { return(sel) } +#' @export compute_template_adjusted_rt <- function(combined, sel, j) { all_features <- cbind(combined$rt[sel], combined$rt[sel + 1]) flip_indices <- which(combined$sample_id[sel] == j) @@ -34,6 +37,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) { return(all_features) } +#' @export compute_corrected_features <- function(features, delta_rt, avg_time) { features <- features[order(features$rt, features$mz), ] corrected <- features$rt @@ -58,6 +62,7 @@ compute_corrected_features <- function(features, delta_rt, avg_time) { return(features) } +#' @export fill_missing_values <- function(orig.feature, this.feature) { missing_values <- which(is.na(this.feature$rt)) for (i in missing_values) { diff --git a/R/compute_clusters.R b/R/compute_clusters.R index 87d8dc51..402e70ac 100644 --- a/R/compute_clusters.R +++ b/R/compute_clusters.R @@ -2,13 +2,13 @@ #' @description #' Sort tibble based on sample_names #' @export -sort_data <- function(sample_names, feature_tables){ +sort_data <- function(sample_names, feature_tables) { index <- c() for (i in seq_along(sample_names)) { - index <- append(index, feature_tables[[i]]$sample_id[1]) + index <- append(index, feature_tables[[i]]$sample_id[1]) } - + index <- match(sample_names, index) feature_tables <- feature_tables[index] @@ -16,7 +16,7 @@ sort_data <- function(sample_names, feature_tables){ } #' Compute clusters of mz and rt and assign cluster id to individual features. -#' +#' #' @description #' Uses tolerances to group features with mz and rt within the tolerance into clusters, #' creating larger features from raw data points. Custom tolerances for mz and rt are @@ -34,7 +34,7 @@ sort_data <- function(sample_names, feature_tables){ #' \item feature_tables - list - Feature tables with added columns [sample_id, cluster]. #' \item rt_tol_relative - float - Newly determined relative rt tolerance. #' \item mz_tol_relative - float - Newly determined relative mz tolerance. -#'} +#' } #' @export compute_clusters <- function(feature_tables, mz_tol_relative, @@ -45,9 +45,9 @@ compute_clusters <- function(feature_tables, sample_names = NA) { number_of_samples <- length(feature_tables) all <- concatenate_feature_tables(feature_tables, sample_names) - + if (is.na(mz_tol_relative)) { - mz_tol_relative <- find.tol( + mz_tol_relative <- find_mz_tolerance( all$mz, mz_max_diff = mz_max_diff, aver.bin.size = 4000, @@ -74,37 +74,104 @@ compute_clusters <- function(feature_tables, ) } - res <- find.tol.time( - all, - number_of_samples = number_of_samples, - mz_tol_relative = mz_tol_relative, - rt_tol_relative = rt_tol_relative, - aver.bin.size = 200, - min.bins = 50, - max.bins = 100, - mz_tol_absolute = mz_tol_absolute, - max.num.segments = 10000, - do.plot = do.plot + aver.bin.size <- 200 + min.bins <- 50 + max.bins <- 100 + max.num.segments <- 10000 + + features <- dplyr::arrange_at(all, "mz") + min_mz_tol <- compute_min_mz_tolerance( + features$mz, + mz_tol_relative, + mz_tol_absolute ) - rt_tol_relative <- res$rt.tol + mz_breaks <- compute_breaks_3(features$mz, min_mz_tol) + features$mz_group <- 0 + + for (i in 2:length(mz_breaks)) { + subset_indices <- (mz_breaks[i - 1] + 1):mz_breaks[i] + features$mz_group[subset_indices] <- i + } + + features <- features |> dplyr::arrange_at(c("mz_group", "rt")) + + mz_breaks <- mz_breaks[c(-1, -length(mz_breaks))] + + if (is.na(rt_tol_relative)) { + rt_tol_relative <- compute_rt_tol_relative( + mz_breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + features$rt, + min.bins, + max.bins + ) + } + + # compute breaks in rt domain + rt_diffs <- diff(features$rt) + rt_breaks <- which(rt_diffs > rt_tol_relative) + + # combine indices of all breaks in array and sort + all.breaks <- c(0, unique(c(mz_breaks, rt_breaks)), nrow(features)) + all.breaks <- all.breaks[order(all.breaks)] + + features$cluster <- 0 + for (i in 2:length(all.breaks)) { + features$cluster[(all.breaks[i - 1] + 1):all.breaks[i]] <- i + } - message("**** performing time correction ****") message(paste("m/z tolerance level: ", mz_tol_relative)) message(paste("time tolerance level:", rt_tol_relative)) - # Select features from individual samples, sort by mz and rt and + # Select features from individual samples, sort by mz and rt and # return the sorted tables as individual tibbles. - feature_tables <- res$features |> + feature_tables <- features |> + dplyr::select(-mz_group) |> dplyr::group_by(sample_id) |> dplyr::arrange_at(c("mz", "rt")) |> dplyr::group_split() - + feature_tables <- sort_data(sample_names, feature_tables) - + return(list(feature_tables = feature_tables, rt_tol_relative = rt_tol_relative, mz_tol_relative = mz_tol_relative)) } +#' Compute clusters using simple grouping based on numeric thresholds. +#' +#' @describtion +#' Features are first grouped in mz dimension based on the tolerance. +#' First, the absolute tolerance is computed for each feature, then a new group is started +#' once the difference between consecutive features is above this threshold. +#' The same process is then repeated for the retention time dimension. +#' The individual indices are then combined into a single index in the `cluster` columns. +#' @param feature_tables list of tibbles feature tables coming from all samples. +#' @param sample_names list of strings Sample names of the feature tables used to distinguish the samples. +#' @param mz_tol_ppm float Relative tolerance for mz grouping in parts per million. +#' @param rt_tol float Tolerance in retention time dimension [seconds]. +#' @return list of tibbles Feature tables passed initially with additional columns indicating the +#' mz and rt groups as well as the combined cluster index. +#' @export +compute_clusters_simple <- function(feature_tables, sample_names, mz_tol_ppm, rt_tol) { + all <- concatenate_feature_tables(feature_tables, sample_names) |> dplyr::arrange_at("mz") + + mz_tol_rel <- mz_tol_ppm * 1e-06 + mz_tol_abs <- all$mz * mz_tol_rel + + all |> + dplyr::mutate(mz_group = cumsum(c(0, diff(mz)) > mz_tol_abs)) |> + dplyr::group_by(mz_group) |> + dplyr::arrange_at("rt") |> + dplyr::mutate(rt_group = cumsum(c(0, diff(rt)) > rt_tol)) |> + dplyr::group_by(mz_group, rt_group) |> + dplyr::mutate(cluster = cur_group_id()) |> + dplyr::ungroup() |> + dplyr::arrange_at("cluster") |> + dplyr::group_by(sample_id) |> + dplyr::group_split() +} # compute_clusters_v2 <- function(feature_tables, mz_tol_ppm, rt_tol) { diff --git a/R/feature.align.R b/R/feature.align.R index 2fb725b4..12486898 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -134,11 +134,13 @@ create_aligned_feature_table <- function(features_table, if (!is(cluster, "cluster")) { cluster <- parallel::makeCluster(cluster) on.exit(parallel::stopCluster(cluster)) + + # NOTE: side effect (doParallel has no functionality to clean up) + doParallel::registerDoParallel(cluster) + register_functions_to_cluster(cluster) } - # NOTE: side effect (doParallel has no functionality to clean up) - doParallel::registerDoParallel(cluster) - register_functions_to_cluster(cluster) + number_of_samples <- length(sample_names) metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names) diff --git a/R/find.tol.time.R b/R/find.tol.time.R index eeb5f1a4..1103c9b0 100644 --- a/R/find.tol.time.R +++ b/R/find.tol.time.R @@ -1,21 +1,3 @@ -#' Compute minimum mz tolerance to use. -#' @description -#' Compute the minimum mz tolerance based on the relative -#' tolerance and the mz values and the absolute tolerance. -#' Uses midpoints between mz values for the weighting. -#' @param mz vector Mz values to use. -#' @param mz_tol_relative float Relative mz tolerance to use with the mz values. -#' This forms a sort of weighted tolerance. -#' @param mz_tol_absolute float Absolute tolerance to use independent from the mz values. -#' @return float Minimum tolerance values to use. -compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { - l <- length(mz) - mz_midpoints <- ((mz[2:l] + mz[1:(l - 1)]) / 2) - mz_ftr_relative_tolerances <- mz_tol_relative * mz_midpoints - min_mz_tol <- min(mz_tol_absolute, mz_ftr_relative_tolerances) - return(min_mz_tol) -} - #' @description #' Compute indices of mass differences greater than min_mz_tol. #' @param mz mz values of all peaks in all profiles in the study. diff --git a/R/find.tol.R b/R/find_mz_tolerance.R similarity index 67% rename from R/find.tol.R rename to R/find_mz_tolerance.R index b7b10e35..db957efc 100644 --- a/R/find.tol.R +++ b/R/find_mz_tolerance.R @@ -2,6 +2,25 @@ NULL #> NULL +#' Compute minimum mz tolerance to use. +#' @description +#' Compute the minimum mz tolerance based on the relative +#' tolerance and the mz values and the absolute tolerance. +#' Uses midpoints between mz values for the weighting. +#' @param mz vector Mz values to use. +#' @param mz_tol_relative float Relative mz tolerance to use with the mz values. +#' This forms a sort of weighted tolerance. +#' @param mz_tol_absolute float Absolute tolerance to use independent from the mz values. +#' @return float Minimum tolerance values to use. +compute_min_mz_tolerance <- function(mz, mz_tol_relative, mz_tol_absolute) { + l <- length(mz) + mz_midpoints <- ((mz[2:l] + mz[1:(l - 1)]) / 2) + mz_ftr_relative_tolerances <- mz_tol_relative * mz_midpoints + min_mz_tol <- min(mz_tol_absolute, mz_ftr_relative_tolerances) + return(min_mz_tol) +} + + #' An internal function that is not supposed to be directly accessed by the user. Find m/z tolerance level. #' #' The function finds the tolerance level in m/z from a given vector of observed m/z values. @@ -14,38 +33,46 @@ NULL #' @param do.plot Indicates whether plot should be drawn. #' @return The tolerance level is returned. #' @export -find.tol <- function(mz, +find_mz_tolerance <- function(mz, mz_max_diff, aver.bin.size, min.bins, max.bins, do.plot) { - mz <- mz[order(mz)] + mz <- sort(mz) l <- length(mz) # pairwise m/z difference divided by their average, filtered outside of tolerance limit - distances <- (mz[2:l] - mz[1:(l - 1)]) / - ((mz[2:l] + mz[1:(l - 1)]) / 2) + pairwise_mean <- (mz[2:l] + mz[1:(l - 1)]) / 2 + distances <- diff(mz) / pairwise_mean distances <- distances[distances < mz_max_diff] # number of equally spaced points at which the density is to be estimated n <- min(max.bins, max(round(length(distances) / aver.bin.size), min.bins)) + # estimate probability density function of distances - des <- density(distances, kernel = "gaussian", n = n, - bw = mz_max_diff / n * 2, from = 0) + des <- density( + distances, + kernel = "gaussian", + n = n, + bw = mz_max_diff / n * 2, + from = 0 + ) # the n (-1?) coordinates of the points where the density is estimated points <- des$y[des$x > 0] # the estimated density values density_values <- des$x[des$x > 0] + + q1 <- max(distances) / 4 # select the upper 75% of the sorted data - top_data <- distances[distances > max(distances) / 4] - max(distances) / 4 + top_data <- distances[distances > q1] - q1 # parameter of the exponential distribution is estimated lambda <- MASS::fitdistr(top_data, "exponential")$estimate # values of the exponential distribution are calculated at equally spaced points estimated_density_values <- dexp(density_values, rate = lambda) # add the rest of the data - estimated_density_values <- estimated_density_values * sum(points[density_values > max(distances) / 4]) / - sum(estimated_density_values[density_values > max(distances) / 4]) + estimated_density_values <- estimated_density_values * sum(points[density_values > q1]) / + sum(estimated_density_values[density_values > q1]) # cutoff is selected where the density of the empirical distribution is >1.5 times the density of the exponential distribution cumulative <- cumsum(points > 1.5 * estimated_density_values) diff --git a/R/hybrid.R b/R/hybrid.R index 2e4c9bfa..b10896e9 100644 --- a/R/hybrid.R +++ b/R/hybrid.R @@ -390,7 +390,7 @@ hybrid <- function( message("**** time correction ****") - corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %do% correct_time( + corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time( this.feature, template_features, extracted_clusters$mz_tol_relative, @@ -429,7 +429,7 @@ hybrid <- function( ) message("**** weaker signal recovery ****") - recovered <- lapply(seq_along(filenames), function(i) { + recovered <- snow::parLapply(cluster, seq_along(filenames), function(i) { recover.weaker( filename = filenames[[i]], sample_name = sample_names[i], @@ -470,7 +470,7 @@ hybrid <- function( message("**** second time correction ****") - corrected <- foreach::foreach(this.feature = recovered_clusters$feature_tables) %do% correct_time( + corrected <- foreach::foreach(this.feature = recovered_clusters$feature_tables) %dopar% correct_time( this.feature, template_features, recovered_clusters$mz_tol_relative, diff --git a/R/prof.to.features.R b/R/prof.to.features.R index 05d128ac..357cc3a6 100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -102,7 +102,7 @@ compute_gaussian_peak_shape <- function(rt_profile, bw, component_eliminate, BIC #' @param sigma.1 left standard deviation of the gaussian curve #' @param sigma.2 right standard deviation of the gaussian curve #' @export -solve.a <- function(x, t, a, sigma.1, sigma.2) { +solve_a <- function(x, t, a, sigma.1, sigma.2) { # This function is a part of bigauss.esti.EM and is not covered by any of test-cases w <- x * (as.numeric(t < a) / sigma.1 + as.numeric(t >= a) / sigma.2) return(sum(t * w) / sum(w)) @@ -115,7 +115,7 @@ solve.a <- function(x, t, a, sigma.1, sigma.2) { #' @param t A vector of numerical values (rt). #' @param a A vector of peak summits. #' @export -prep.uv <- function(x, t, a) { +prep_uv <- function(x, t, a) { # This function is a part of bigauss.esti.EM and is not covered by any of test-cases temp <- (t - a)^2 * x u <- sum(temp * as.numeric(t < a)) @@ -138,9 +138,9 @@ prep.uv <- function(x, t, a) { #' \item standard deviation at the right side of the gaussian curve #' } #' @export -solve.sigma <- function(x, t, a) { +solve_sigma <- function(x, t, a) { # This function is a part of bigauss.esti.EM and is not covered by any of test-cases - tt <- prep.uv(x, t, a) + tt <- prep_uv(x, t, a) sigma.1 <- tt$u / tt$x.sum * ((tt$v / tt$u)^(1 / 3) + 1) sigma.2 <- tt$v / tt$x.sum * ((tt$u / tt$v)^(1 / 3) + 1) return(list( @@ -192,11 +192,11 @@ bigauss.esti.EM <- function(t, x, max.iter = 50, epsilon = 0.005, do.plot = FALS while ((change > epsilon) & (n.iter < max.iter)) { a.old <- a.new n.iter <- n.iter + 1 - sigma <- solve.sigma(x, t, a.old) + sigma <- solve_sigma(x, t, a.old) if (n.iter == 1) { sigma[is.na(sigma)] <- as.numeric(sigma[which(!is.na(sigma))])[1] / 10 } - a.new <- solve.a(x, t, a.old, sigma$sigma.1, sigma$sigma.2) + a.new <- solve_a(x, t, a.old, sigma$sigma.1, sigma$sigma.2) change <- abs(a.old - a.new) } d <- x diff --git a/R/recover.weaker.R b/R/recover.weaker.R index 46082d24..eab14d79 100644 --- a/R/recover.weaker.R +++ b/R/recover.weaker.R @@ -58,6 +58,7 @@ compute_delta_rt <- function(times) { #' @description x / sum(x) #' @param x Data to normalize. #' @return Normalized data. +#' @export l2normalize <- function(x) { x / sum(x) } @@ -157,6 +158,7 @@ compute_target_times <- function(aligned_rts, #' checks which values only occur a single time. #' @param values vector Values for which to compute the mask. #' @return vector Boolean vector which is the mask of values occuring only once. +#' @export get_single_occurrence_mask <- function(values) { ttt <- table(values) mask <- values %in% as.numeric(names(ttt)[ttt == 1]) @@ -275,6 +277,7 @@ get_rt_region_indices <- function(target_time, features, rt_tol) { #' \item pks - vector - The data points at which the density peaks. #' \item vlys - vector - The points in the data where the density is low #' (forming a valley in the function). +#' @export get_features_in_rt_range <- function(features, times, bw) { time_curve <- times[between(times, min(features$rt), max(features$rt))] @@ -297,6 +300,7 @@ get_features_in_rt_range <- function(features, times, bw) { #' @param roi list Named list with vectors `pks` and `vlys`. #' @param times vector Retention time values #' @return vector Numbers of peaks within each region defined by a peak and the two valley points. +#' @export count_peaks <- function(roi, times) { num_peaks <- rep(0, length(roi$pks)) @@ -319,6 +323,7 @@ count_peaks <- function(roi, times) { #' \item pks - vector - The data points at which the density peaks with at least `recover_min_count` peaks between the valley points. #' \item vlys - vector - The points in the data where the density is low #' (forming a valley in the function). +#' @export compute_pks_vlys_rt <- function(features, times, bandwidth, target_rt, recover_min_count) { roi <- get_features_in_rt_range( features, @@ -383,6 +388,7 @@ compute_mu_sc_std <- function(features, aver_diff) { #' @param delta_rt vector Differences between consecutive retention time values (diff(times)). #' @importFrom dplyr between #' @return list Triplet of mz, label and intensity for the feature. +#' @export compute_curr_rec_with_enough_peaks <- function(mz, peak, valleys, @@ -448,6 +454,7 @@ compute_boundaries <- function(valley_points, peak) { #' \item vlys - vector - The points in the data where the density is low #' (forming a valley in the function). #' } +#' @export compute_peaks_and_valleys <- function(dens) { turns <- find.turn.point(dens$y) pks <- dens$x[turns$pks] # mz values with highest density @@ -474,6 +481,7 @@ compute_peaks_and_valleys <- function(dens) { #' @param min_bandwidth float Minimum bandwidth to use. #' @param max_bandwidth float Maximum bandwidth to use. #' @return tibble Tibble with `mz`, `rt` and `intensities` columns. +#' @export compute_rectangle <- function(data_table, aligned_feature_mz, breaks, @@ -556,7 +564,7 @@ compute_rectangle <- function(data_table, } else { rt_intensities <- dplyr::select( that.prof, - c("rt", "intensities") + all_of(c("rt", "intensities")) ) |> dplyr::arrange_at("rt") bw <- min(max(bandwidth * (span(rt_intensities$rt)), min_bandwidth), max_bandwidth) @@ -602,6 +610,7 @@ compute_rectangle <- function(data_table, #' @param rt_tol float Retention time tolerance. #' @param mz_tol float Mz tolerance to use. #' @return int Index of value in rectable closest to `target_rt` and `aligned_mz`. +#' @export refine_selection <- function(target_rt, rectangle, aligned_mz, rt_tol, mz_tol) { if (!is.na(target_rt)) { rt_term <- (rectangle$rt - target_rt)^2 / rt_tol^2 @@ -684,8 +693,8 @@ recover.weaker <- function(filename, vec_delta_rt <- compute_delta_rt(times) sample_intensities <- unlist(dplyr::select( - intensity_table %>% dplyr::rename_with(~ str_remove(., "_intensity")), - sample_name + intensity_table %>% dplyr::rename_with(~str_remove(., "_intensity")), + all_of(sample_name) ), use.names = FALSE) custom.mz.tol <- recover_mz_range * metadata_table$mz diff --git a/R/remove_noise.R b/R/remove_noise.R index 05d2ce81..07ca42d8 100644 --- a/R/remove_noise.R +++ b/R/remove_noise.R @@ -55,7 +55,7 @@ load_data <- function(filename, #' the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended. #' @param baseline_correct After grouping the observations, the highest intensity in each group is found. If #' the highest is lower than this value, the entire group will be deleted. -#' @param baseline_correct_noise_percentile The percentile of signal strength of those EIC that don't pass the +#' @param baseline_correct_noise_percentile Only used for plotting. The percentile of signal strength of those EIC that don't pass the #' run filter, to be used as the baseline threshold of signal strength. #' @param intensity_weighted Whether to use intensity to weight mass density estimation. #' @param do.plot Indicates whether plot should be drawn. diff --git a/R/unsupervised.R b/R/unsupervised.R index a41caf0b..09c39b20 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -180,7 +180,7 @@ unsupervised <- function( message("**** time correction ****") - corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %do% correct_time( + corrected <- foreach::foreach(this.feature = extracted_clusters$feature_tables) %dopar% correct_time( this.feature, template_features, extracted_clusters$mz_tol_relative, @@ -209,7 +209,7 @@ unsupervised <- function( ) message("**** weaker signal recovery ****") - recovered <- lapply(seq_along(filenames), function(i) { + recovered <- snow::parLapply(cluster, seq_along(filenames), function(i) { recover.weaker( filename = filenames[[i]], sample_name = sample_names[i], diff --git a/R/utils.R b/R/utils.R index 288f77e6..14f48277 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,7 +40,6 @@ register_functions_to_cluster <- function(cluster) { 'compute_e_step', 'compute_start_bound', 'compute_end_bound', - 'compute_bounds', 'compute_scale', 'span', 'compute_uniq_grp', @@ -53,9 +52,40 @@ register_functions_to_cluster <- function(cluster) { "find_optima", "filter_based_on_density", "create_output", - "comb" + "comb", + 'bigauss.esti.EM', + 'solve_sigma', + 'prep_uv', + 'solve_a', + 'correct_time', + 'compute_comb', + 'compute_sel', + 'compute_template_adjusted_rt', + 'compute_corrected_features', + 'fill_missing_values', + 'recover.weaker', + 'get_custom_rt_tol', + 'compute_target_times', + 'predict_mz_break_indices', + 'compute_rectangle', + 'get_rt_region_indices', + 'refine_selection', + 'duplicate.row.remove', + 'get_times_to_use', + 'get_single_occurrence_mask', + 'compute_curr_rec_with_enough_peaks', + 'compute_mu_sc_std', + 'compute_pks_vlys_rt', + 'count_peaks', + 'get_features_in_rt_range', + 'get_mzrange_bound_indices', + 'compute_mass_density', + 'l2normalize', + 'compute_peaks_and_valleys' )) snow::clusterEvalQ(cluster, library("dplyr")) + snow::clusterEvalQ(cluster, library("stringr")) + snow::clusterEvalQ(cluster, library("tidyselect")) } #' Concatenate multiple feature lists and add the sample id (origin of feature) as additional column. diff --git a/man/aggregate_by_rt.Rd b/man/aggregate_by_rt.Rd new file mode 100644 index 00000000..e81cb6ac --- /dev/null +++ b/man/aggregate_by_rt.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate_by_rt.R +\name{aggregate_by_rt} +\alias{aggregate_by_rt} +\title{Aggregate the intensities and select median mz for features with identical rt.} +\usage{ +aggregate_by_rt(features) +} +\arguments{ +\item{features}{dataframe of retention time, m/z ratio, signal strength.} +} +\value{ +returns a tibble with the following columns +\itemize{ + \item mz - m/z ratio + \item rt - retention time + \item intensities - signal strength +} +} +\description{ +This functions computes median mz and sum of intensities over features with same rt. +} diff --git a/man/augment_known_table.Rd b/man/augment_known_table.Rd new file mode 100644 index 00000000..a8c3126e --- /dev/null +++ b/man/augment_known_table.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hybrid.R +\name{augment_known_table} +\alias{augment_known_table} +\title{Add newly detected aligned features to a known features table.} +\usage{ +augment_known_table( + aligned, + known_table, + match_tol_ppm, + mz_tol_relative, + rt_tol_relative, + new_feature_min_count +) +} +\arguments{ +\item{aligned}{A list object with three tibble tables: metadata, intensity, and rt.} + +\item{known_table}{A table of known/previously detected peaks.} + +\item{match_tol_ppm}{The ppm tolerance to match identified features to known metabolites/features.} + +\item{mz_tol_relative}{The m/z tolerance level for peak alignment. The default is NA, which allows the program to search for the +tolerance level based on the data. This value is expressed as the percentage of the m/z value. This value, multiplied by the m/z +value, becomes the cutoff level.} + +\item{rt_tol_relative}{The retention time tolerance level for peak alignment. The default is NA, which allows the program to search for +the tolerance level based on the data.} + +\item{new_feature_min_count}{The number of profiles a new feature must be present for it to be added to the database.} +} +\value{ +Known table with novel features. +} +\description{ +Add newly detected aligned features to a known features table. +} diff --git a/man/compute_boundaries.Rd b/man/compute_boundaries.Rd new file mode 100644 index 00000000..0c5086f1 --- /dev/null +++ b/man/compute_boundaries.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_boundaries} +\alias{compute_boundaries} +\title{Compute bounds of area using given peak and mass valley points.} +\usage{ +compute_boundaries(valley_points, peak) +} +\arguments{ +\item{valley_points}{vector values of valley points defining clusters.} + +\item{peak}{double Value of the peak for which to get the valley bounds.} +} +\value{ +Returns a list object with the following objects in it: +\itemize{ + \item lower - double - The value of the lower bound valley point. + \item upper - double - The value of the upper bound valley point. +} +} +\description{ +The lower bound is the mass of the valley the closest but smaller than peak +and the upper bound is the mass of the valley the closest but higher than +the peak. +} diff --git a/man/compute_clusters_simple.Rd b/man/compute_clusters_simple.Rd new file mode 100644 index 00000000..09ab9f3e --- /dev/null +++ b/man/compute_clusters_simple.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_clusters.R +\name{compute_clusters_simple} +\alias{compute_clusters_simple} +\title{Compute clusters using simple grouping based on numeric thresholds.} +\usage{ +compute_clusters_simple(feature_tables, sample_names, mz_tol_ppm, rt_tol) +} +\arguments{ +\item{feature_tables}{list of tibbles List of feature tables coming from all samples.} + +\item{sample_names}{list of strings Sample names of the feature tables used to distinguish the samples.} + +\item{mz_tol_ppm}{float Relative tolerance for mz grouping in parts per million.} + +\item{rt_tol}{float Tolerance in retention time dimension [seconds].} +} +\value{ +list of tibbles Feature tables passed initially with additional columns indicating the +mz and rt groups as well as the combined cluster index. +} +\description{ +Compute clusters using simple grouping based on numeric thresholds. +} diff --git a/man/compute_curr_rec_with_enough_peaks.Rd b/man/compute_curr_rec_with_enough_peaks.Rd new file mode 100644 index 00000000..bb673ce4 --- /dev/null +++ b/man/compute_curr_rec_with_enough_peaks.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_curr_rec_with_enough_peaks} +\alias{compute_curr_rec_with_enough_peaks} +\title{Compute the rectangle around recovered features given that enough peaks are present.} +\usage{ +compute_curr_rec_with_enough_peaks( + mz, + peak, + valleys, + features, + aver_diff, + times, + delta_rt +) +} +\arguments{ +\item{mz}{Mz value of the feature.} + +\item{peak}{Peak around which to detect the new feature.} + +\item{valleys}{Valley points to compute the boundary region.} + +\item{features}{tibble Tibble with `rt` and `intensities` column.} + +\item{aver_diff}{float Average retention time difference.} + +\item{times}{vector Raw retention time values from raw data file.} + +\item{delta_rt}{vector Differences between consecutive retention time values (diff(times)).} +} +\value{ +list Triplet of mz, label and intensity for the feature. +} +\description{ +Compute the rectangle around recovered features given that enough peaks are present. +} diff --git a/man/compute_delta_rt.Rd b/man/compute_delta_rt.Rd new file mode 100644 index 00000000..05fa5b66 --- /dev/null +++ b/man/compute_delta_rt.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_delta_rt} +\alias{compute_delta_rt} +\title{Compute custom smoothed h-method derivative of function.} +\usage{ +compute_delta_rt(times) +} +\arguments{ +\item{times}{Retention time values.} +} +\value{ +Differences between time values. +} +\description{ +The function adds an extrapolated element in the end and a 0 element in the front, +then computes the midpoints between#' neighbouring elements and then uses the `diff` +function to compute the changes in rt between points. +} diff --git a/man/compute_mass_density.Rd b/man/compute_mass_density.Rd new file mode 100644 index 00000000..601c94d1 --- /dev/null +++ b/man/compute_mass_density.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_mass_density} +\alias{compute_mass_density} +\title{Compute the density function of mz values.} +\usage{ +compute_mass_density(features, bandwidth, intensity_weighted, n = 512) +} +\arguments{ +\item{bandwidth}{Bandwidth to use to compute the kernel density.} + +\item{intensity_weighted}{Whether to use intensity weighting or not.} + +\item{mz}{Mass values to compute the density over.} + +\item{intensities}{Intensities of the peaks at mz values. +Only used if intensity_weighted == TRUE.} +} +\value{ +\link[stats]{density} object containing the densities. +} +\description{ +The function takes the mz values and uses \link[stats]{density} to +compute the local density, optionally using intensity based weighting. +} diff --git a/man/compute_min_mz_tolerance.Rd b/man/compute_min_mz_tolerance.Rd new file mode 100644 index 00000000..f228b26e --- /dev/null +++ b/man/compute_min_mz_tolerance.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/find_mz_tolerance.R +\name{compute_min_mz_tolerance} +\alias{compute_min_mz_tolerance} +\title{Compute minimum mz tolerance to use.} +\usage{ +compute_min_mz_tolerance(mz, mz_tol_relative, mz_tol_absolute) +} +\arguments{ +\item{mz}{vector Mz values to use.} + +\item{mz_tol_relative}{float Relative mz tolerance to use with the mz values. +This forms a sort of weighted tolerance.} + +\item{mz_tol_absolute}{float Absolute tolerance to use independent from the mz values.} +} +\value{ +float Minimum tolerance values to use. +} +\description{ +Compute the minimum mz tolerance based on the relative +tolerance and the mz values and the absolute tolerance. +Uses midpoints between mz values for the weighting. +} diff --git a/man/compute_mu_sc_std.Rd b/man/compute_mu_sc_std.Rd new file mode 100644 index 00000000..460842e3 --- /dev/null +++ b/man/compute_mu_sc_std.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_mu_sc_std} +\alias{compute_mu_sc_std} +\title{Compute interpolated retention time, its standard deviation, and intensity values,.} +\usage{ +compute_mu_sc_std(features, aver_diff) +} +\arguments{ +\item{features}{tibble Features with `rt` and `intensities` columns.} + +\item{aver_diff}{float Average retention time difference.} +} +\value{ + +} +\description{ +Compute interpolated retention time, its standard deviation, and intensity values,. +} diff --git a/man/compute_peaks_and_valleys.Rd b/man/compute_peaks_and_valleys.Rd new file mode 100644 index 00000000..3321bff7 --- /dev/null +++ b/man/compute_peaks_and_valleys.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_peaks_and_valleys} +\alias{compute_peaks_and_valleys} +\title{Compute peaks and valleys of density function.} +\usage{ +compute_peaks_and_valleys(dens) +} +\arguments{ +\item{density}{stats::density Density object for which to compute peaks +and valleys.} +} +\value{ +Returns a list object with the following objects in it: +\itemize{ + \item pks - vector - The data points at which the density peaks. + \item vlys - vector - The points in the data where the density is low + (forming a valley in the function). +} +} +\description{ +Given a density function, the turn points are computed and +the peaks and valleys in the original data (points with highest +and lowest density) are returned. +} diff --git a/man/compute_pks_vlys_rt.Rd b/man/compute_pks_vlys_rt.Rd new file mode 100644 index 00000000..00a46608 --- /dev/null +++ b/man/compute_pks_vlys_rt.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_pks_vlys_rt} +\alias{compute_pks_vlys_rt} +\title{Compute peaks and valleys which have at least `recover_min_count` peaks.} +\usage{ +compute_pks_vlys_rt(features, times, bandwidth, target_rt, recover_min_count) +} +\arguments{ +\item{features}{tibble Features with `mz`, `rt` and `intensities`.} + +\item{times}{vector Retention time values from the raw data file.} + +\item{bandwidth}{float Bandwidth to use in smoothing.} + +\item{target_rt}{float Retention time at which to recover the intensity.} + +\item{recover_min_count}{int Minimum number of peaks required in the area to recover the signal.} +} +\value{ + +} +\description{ +Compute peaks and valleys which have at least `recover_min_count` peaks. +} diff --git a/man/compute_rectangle.Rd b/man/compute_rectangle.Rd new file mode 100644 index 00000000..edb26654 --- /dev/null +++ b/man/compute_rectangle.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_rectangle} +\alias{compute_rectangle} +\title{Compute rectangle around feature with `aligned_feature_mz` and `target_rt` for recovery.} +\usage{ +compute_rectangle( + data_table, + aligned_feature_mz, + breaks, + custom_mz_tol, + orig_mz_tol, + use_intensity_weighting, + recover_min_count, + target_rt, + custom_rt_tol, + times, + delta_rt, + aver_diff, + bandwidth, + min_bandwidth, + max_bandwidth +) +} +\arguments{ +\item{data_table}{tibble Feature table with `mz`, `rt` and `intensities` column.} + +\item{aligned_feature_mz}{float Mz value of feature in aligned feature table.} + +\item{breaks}{vector Integer boundaries of clusters in mz values.} + +\item{custom_mz_tol}{float Custom mz tolerance for the feature.} + +\item{orig_mz_tol}{float Flat original mz tolerance to use.} + +\item{use_intensity_weighting}{bool Whether to use intensity weighting.} + +\item{recover_min_count}{int Minimum number of peaks required in the area to recover the signal.} + +\item{target_rt}{float Target retention time value.} + +\item{custom_rt_tol}{float Custom chromatographic tolerance to use.} + +\item{times}{vector Raw retention time values from raw data file.} + +\item{delta_rt}{vector Differences between consecutive retention time values (diff(times)).} + +\item{aver_diff}{float Average retention time difference.} + +\item{bandwidth}{float Bandwidth to use in smoothing.} + +\item{min_bandwidth}{float Minimum bandwidth to use.} + +\item{max_bandwidth}{float Maximum bandwidth to use.} +} +\value{ +tibble Tibble with `mz`, `rt` and `intensities` columns. +} +\description{ +Compute rectangle around feature with `aligned_feature_mz` and `target_rt` for recovery. +} diff --git a/man/compute_rt_tol_relative.Rd b/man/compute_rt_tol_relative.Rd new file mode 100644 index 00000000..2b479c4d --- /dev/null +++ b/man/compute_rt_tol_relative.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/find.tol.time.R +\name{compute_rt_tol_relative} +\alias{compute_rt_tol_relative} +\title{Compute rt relative tolerance to use.} +\usage{ +compute_rt_tol_relative( + breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + rt, + min.bins, + max.bins, + do.plot = FALSE +) +} +\arguments{ +\item{max.num.segments}{the maximum number of segments.} + +\item{aver.bin.size}{The average bin size to determine the number of equally spaced points in the kernel density estimation.} + +\item{number_of_samples}{The number of spectra in this analysis.} + +\item{rt}{retention time of all peaks in all profiles in the study.} + +\item{min.bins}{the minimum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too few observations are present.} + +\item{max.bins}{the maximum number of bins to use in the kernel density estimation. It overrides aver.bin.size when too many observations are present.} + +\item{do.plot}{Indicates whether plot should be drawn.} +} +\value{ +rt_tol_relative the elution time tolerance. +This conditional makes sure that length(s) is <= max.num.segments +If False, length(s) = max.num.segments, and s[i] is the largest +integer no greater than the corresponding element. Otherwise +length(s) = length(breaks) - 1 +This loop creates a vector with distances between rt peaks. Distances +are stored in a triangular matrix and converted to a vector subsequently. +Vector length should be < 100, otherwise, vector is +constructed extracting only 100 samples. +} +\description{ +Compute the elution time tolerance based on the kernel density estimation. +It plots the fitting function if set to TRUE. +} diff --git a/man/compute_target_times.Rd b/man/compute_target_times.Rd new file mode 100644 index 00000000..05f5fbc8 --- /dev/null +++ b/man/compute_target_times.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{compute_target_times} +\alias{compute_target_times} +\title{Compute target times for regions of interest for recovery.} +\usage{ +compute_target_times( + aligned_rts, + original_sample_rts, + adjusted_sample_rts, + min_common_times = 4 +) +} +\arguments{ +\item{aligned_rts}{vector Aligned retention time values.} + +\item{original_sample_rts}{vector Original feature retention time values before correction.} + +\item{adjusted_sample_rts}{vector Feature retention time values after time correction.} +} +\description{ +Compute the individual target times for the features to be recovered in the sample. +Spline interpolation using \link[splines]{interpSpline} is used to map from adjusted times +back into the original times. The function requires `x` to be distinct, hence the filtering +to only include rt values that occurr only once in both lists. +} diff --git a/man/compute_uniq_grp.Rd b/man/compute_uniq_grp.Rd new file mode 100644 index 00000000..ec2aa5b6 --- /dev/null +++ b/man/compute_uniq_grp.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/run_filter.R +\name{compute_uniq_grp} +\alias{compute_uniq_grp} +\title{Computes unique groups} +\usage{ +compute_uniq_grp(profile, min_count_run, min_pres) +} +\arguments{ +\item{profile}{The matrix containing m/z, retention time, intensity, and EIC label as columns.} + +\item{min_count_run}{filter parameter.} + +\item{min_pres}{Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped +by m/z to be considered a peak.} +} +\value{ +unique_grp. +} +\description{ +Computes unique groups +} diff --git a/man/concatenate_feature_tables.Rd b/man/concatenate_feature_tables.Rd new file mode 100644 index 00000000..510bdd2e --- /dev/null +++ b/man/concatenate_feature_tables.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{concatenate_feature_tables} +\alias{concatenate_feature_tables} +\title{Concatenate multiple feature lists and add the sample id (origin of feature) as additional column.} +\usage{ +concatenate_feature_tables(features, sample_names) +} +\arguments{ +\item{features}{list List of tibbles containing extracted feature tables.} +} +\description{ +Concatenate multiple feature lists and add the sample id (origin of feature) as additional column. +} diff --git a/man/count_peaks.Rd b/man/count_peaks.Rd new file mode 100644 index 00000000..ed07b14b --- /dev/null +++ b/man/count_peaks.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{count_peaks} +\alias{count_peaks} +\title{Count the number of peaks in all valleys} +\usage{ +count_peaks(roi, times) +} +\arguments{ +\item{roi}{list Named list with vectors `pks` and `vlys`.} + +\item{times}{vector Retention time values} +} +\value{ +vector Numbers of peaks within each region defined by a peak and the two valley points. +} +\description{ +For each peak in ROI, count the peaks between the valley points. +} diff --git a/man/create_aligned_feature_table.Rd b/man/create_aligned_feature_table.Rd new file mode 100644 index 00000000..034df37d --- /dev/null +++ b/man/create_aligned_feature_table.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_aligned_feature_table} +\alias{create_aligned_feature_table} +\title{Align peaks from spectra into a feature table.} +\usage{ +create_aligned_feature_table( + features_table, + min_occurrence, + sample_names, + rt_tol_relative, + mz_tol_relative, + cluster = 4 +) +} +\arguments{ +\item{features_table}{A list object. Each component is a matrix which is the output from compute_clusters().} + +\item{min_occurrence}{A feature has to show up in at least this number of profiles to be included in the final result.} + +\item{sample_names}{list List of sample names.} + +\item{rt_tol_relative}{The retention time tolerance level for peak alignment. The default is NA, which +allows the program to search for the tolerance level based on the data.} + +\item{mz_tol_relative}{The m/z tolerance level for peak alignment. The default is NA, which allows the +program to search for the tolerance level based on the data. This value is expressed as the +percentage of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level.} + +\item{cluster}{The number of CPU cores to be used} +} +\value{ +A tibble with three tables containing aligned metadata, intensities an RTs. +} +\description{ +Align peaks from spectra into a feature table. +} diff --git a/man/duplicate.row.remove.Rd b/man/duplicate.row.remove.Rd new file mode 100644 index 00000000..fadc52ea --- /dev/null +++ b/man/duplicate.row.remove.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{duplicate.row.remove} +\alias{duplicate.row.remove} +\title{Custom way of removing duplicate rows from a specifically formatted table.} +\usage{ +duplicate.row.remove(features, tolerance = 1e-10) +} +\arguments{ +\item{tolerance}{Tolerance to use for numeric comparisons.} + +\item{new.table}{The table from which the duplicate rows should be removed. Needs at least 5 columns. +Columns 1, 2 and 5 have to be of numeric type.} +} +\value{ +Returns the same table with duplicate rows removed. +} +\description{ +Rows are considered as duplicate if the 1st, 2nd and 5th column are less than 1e-10 (tolerance) apart. +Only a single row in this `range` is kept from a group. +} diff --git a/man/enrich_table_by_known_features.Rd b/man/enrich_table_by_known_features.Rd new file mode 100644 index 00000000..d322eaee --- /dev/null +++ b/man/enrich_table_by_known_features.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hybrid.R +\name{enrich_table_by_known_features} +\alias{enrich_table_by_known_features} +\title{Add entries from the known features table to the aligned table.} +\usage{ +enrich_table_by_known_features( + aligned, + known_table, + match_tol_ppm, + mz_tol_relative, + rt_tol_relative +) +} +\arguments{ +\item{aligned}{A list object with three tibble tables: metadata, intensity, and rt.} + +\item{known_table}{A table of known/previously detected peaks.} + +\item{match_tol_ppm}{The ppm tolerance to match identified features to known metabolites/features.} + +\item{mz_tol_relative}{The m/z tolerance level for peak alignment. The default is NA, which allows the program to search for the +tolerance level based on the data. This value is expressed as the percentage of the m/z value. This value, multiplied by the m/z +value, becomes the cutoff level.} + +\item{rt_tol_relative}{The retention time tolerance level for peak alignment. The default is NA, which allows the program to search for +the tolerance level based on the data.} +} +\value{ +Aligned table with known features. +} +\description{ +Add entries from the known features table to the aligned table. +} diff --git a/man/find_mz_match.Rd b/man/find_mz_match.Rd new file mode 100644 index 00000000..2724f7b4 --- /dev/null +++ b/man/find_mz_match.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hybrid.R +\name{find_mz_match} +\alias{find_mz_match} +\title{Compute matches between mz array and specific mass value with a tolerance.} +\usage{ +find_mz_match(sample_mz, known_mz, match_tol_ppm = 5) +} +\arguments{ +\item{sample_mz}{The mz array for which to compute the matching.} + +\item{known_mz}{The mz value with which to match.} + +\item{match_tol_ppm}{Matching tolerance in ppm.} +} +\value{ +Indicies of m/z values within the tolerance of any known m/z. +} +\description{ +Compute matches between mz array and specific mass value with a tolerance. +} +\examples{ +find_mz_match( + sample_mz = c(10, 20, 21), + known_mz = 20 +) +} diff --git a/man/find.tol.Rd b/man/find_mz_tolerance.Rd similarity index 83% rename from man/find.tol.Rd rename to man/find_mz_tolerance.Rd index c693e887..43d63e7f 100644 --- a/man/find.tol.Rd +++ b/man/find_mz_tolerance.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/find.tol.R -\name{find.tol} -\alias{find.tol} +% Please edit documentation in R/find_mz_tolerance.R +\name{find_mz_tolerance} +\alias{find_mz_tolerance} \title{An internal function that is not supposed to be directly accessed by the user. Find m/z tolerance level.} \usage{ -find.tol(mz, mz_max_diff, aver.bin.size, min.bins, max.bins, do.plot) +find_mz_tolerance(mz, mz_max_diff, aver.bin.size, min.bins, max.bins, do.plot) } \arguments{ \item{mz}{The vector of observed m/z values.} diff --git a/man/get_custom_rt_tol.Rd b/man/get_custom_rt_tol.Rd new file mode 100644 index 00000000..00d052cf --- /dev/null +++ b/man/get_custom_rt_tol.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{get_custom_rt_tol} +\alias{get_custom_rt_tol} +\title{Compute custom chromatographic tolerance.} +\usage{ +get_custom_rt_tol(use_observed_range, peak_rts, rt_range, aligned_features) +} +\arguments{ +\item{use_observed_range}{bool Whether to use the observed chromatographic range for computation or not.} + +\item{peak_rts}{data.frame Retention time cross table with all peak rts.} + +\item{rt_range}{float Default chromatographic tolerance to use.} + +\item{aligned_features}{data.frame Aligned feature table.} +} +\value{ +vector Custom chromatographic tolerances to use for each feature. +} +\description{ +Compute chromatographic tolerance for each feature. If `use_observed_range == TRUE`, +the whole range of retention times for all peaks is used to compute the tolerance, +otherwise `rt_range` is used for each feature. +} diff --git a/man/get_features_in_rt_range.Rd b/man/get_features_in_rt_range.Rd new file mode 100644 index 00000000..0311079f --- /dev/null +++ b/man/get_features_in_rt_range.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{get_features_in_rt_range} +\alias{get_features_in_rt_range} +\title{Get peaks and valleys of smoothed rt values in range.} +\usage{ +get_features_in_rt_range(features, times, bw) +} +\arguments{ +\item{features}{tibble Data table with `rt` and `intensities` columns.} + +\item{times}{vector Raw retention time data from raw data file.} + +\item{bw}{float Bandwidth to use for kernel smoothing.} +} +\value{ + +} +\description{ +Get peaks and valleys of smoothed rt values in range. +} diff --git a/man/get_mzrange_bound_indices.Rd b/man/get_mzrange_bound_indices.Rd new file mode 100644 index 00000000..f77b5f1c --- /dev/null +++ b/man/get_mzrange_bound_indices.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{get_mzrange_bound_indices} +\alias{get_mzrange_bound_indices} +\title{Compute range of valley indices which are in mz_tol range around aligned_feature_mass.} +\usage{ +get_mzrange_bound_indices(aligned_feature_mass, mz, vlys_indices, mz_tol) +} +\arguments{ +\item{aligned_feature_mass}{float Mz value of the aligned feature.} + +\item{mz}{vector mz values of the features.} + +\item{vlys_indices}{vector Indices of the valley points of mz clusters.} + +\item{mz_tol}{float Tolerance to use to check if values are close.} +} +\value{ +pair Index range (start, end). +} +\description{ +Compute range of valley indices which are in mz_tol range around aligned_feature_mass. +} diff --git a/man/get_rt_region_indices.Rd b/man/get_rt_region_indices.Rd new file mode 100644 index 00000000..67cf2a22 --- /dev/null +++ b/man/get_rt_region_indices.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{get_rt_region_indices} +\alias{get_rt_region_indices} +\title{Get indices where rt in `features` is within `rt_tol` of `target_time`.} +\usage{ +get_rt_region_indices(target_time, features, rt_tol) +} +\arguments{ +\item{target_time}{float Target retention time region.} + +\item{features}{tibble Feature table including `rt` column.} + +\item{rt_tol}{float Retention time tolerance.} +} +\value{ +vector Indices which are within `rt_tol` from `target_time` or +1 if `target_time` is NA. +} +\description{ +Get indices where rt in `features` is within `rt_tol` of `target_time`. +} diff --git a/man/get_single_occurrence_mask.Rd b/man/get_single_occurrence_mask.Rd new file mode 100644 index 00000000..4638b86c --- /dev/null +++ b/man/get_single_occurrence_mask.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{get_single_occurrence_mask} +\alias{get_single_occurrence_mask} +\title{Get boolean mask for values that occur only once.} +\usage{ +get_single_occurrence_mask(values) +} +\arguments{ +\item{values}{vector Values for which to compute the mask.} +} +\value{ +vector Boolean vector which is the mask of values occuring only once. +} +\description{ +Uses the \link[base]{table} function to compute the occurrences and then +checks which values only occur a single time. +} diff --git a/man/get_times_to_use.Rd b/man/get_times_to_use.Rd new file mode 100644 index 00000000..f27d9c2a --- /dev/null +++ b/man/get_times_to_use.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{get_times_to_use} +\alias{get_times_to_use} +\title{Get retention time values to use} +\usage{ +get_times_to_use(original_sample_rts, adjusted_sample_rts, cap = 2000) +} +\arguments{ +\item{original_sample_rts}{vector Original feature retention time values before correction.} + +\item{adjusted_sample_rts}{vector Feature retention time values after time correction.} + +\item{cap}{int Maximum number of time values to return.} +} +\value{ +Indices of retention time values to use. +} +\description{ +Obtain retention time values which occur only once in both the original and the adjusted times. +This is a custom version of the unique or intersection function with rounding etc. +} diff --git a/man/l2normalize.Rd b/man/l2normalize.Rd new file mode 100644 index 00000000..b871aeee --- /dev/null +++ b/man/l2normalize.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{l2normalize} +\alias{l2normalize} +\title{Normalize vector so that sum(vec) = 1} +\usage{ +l2normalize(x) +} +\arguments{ +\item{x}{Data to normalize.} +} +\value{ +Normalized data. +} +\description{ +x / sum(x) +} diff --git a/man/label_val_to_keep.Rd b/man/label_val_to_keep.Rd new file mode 100644 index 00000000..ff65f677 --- /dev/null +++ b/man/label_val_to_keep.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/run_filter.R +\name{label_val_to_keep} +\alias{label_val_to_keep} +\title{This function labels the indices of values kept to perform further calculations} +\usage{ +label_val_to_keep(min_run, timeline, min_pres, this_times, times) +} +\arguments{ +\item{min_run}{Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.} + +\item{min_pres}{Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped +by m/z to be considered a peak.} + +\item{times.}{Retention times vector.} +} +\value{ +to_keep. +} +\description{ +This function labels the indices of values kept to perform further calculations +} diff --git a/man/load_data.Rd b/man/load_data.Rd new file mode 100644 index 00000000..053ba8e3 --- /dev/null +++ b/man/load_data.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/remove_noise.R +\name{load_data} +\alias{load_data} +\title{Load data either from cache or load raw file and detect peaks.} +\usage{ +load_data(filename, cache, min_run, min_pres, mz_tol, intensity_weighted) +} +\description{ +THIS FUNCTION IS DEPRECATED! +} diff --git a/man/load_file.Rd b/man/load_file.Rd new file mode 100644 index 00000000..6a5be672 --- /dev/null +++ b/man/load_file.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/remove_noise.R +\name{load_file} +\alias{load_file} +\title{Load raw data from file} +\usage{ +load_file(filename) +} +\description{ +Load raw data from file +} diff --git a/man/match_peaks.Rd b/man/match_peaks.Rd new file mode 100644 index 00000000..2f785aa7 --- /dev/null +++ b/man/match_peaks.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hybrid.R +\name{match_peaks} +\alias{match_peaks} +\title{Match peaks from sample table to already known peaks via similar m/z and rt.} +\usage{ +match_peaks( + aligned, + known_table, + match_tol_ppm, + mz_tol_relative, + rt_tol_relative +) +} +\arguments{ +\item{aligned}{A list object with three tibble tables: metadata, intensity, and rt.} + +\item{known_table}{A table of known/previously detected peaks.} + +\item{match_tol_ppm}{The ppm tolerance to match identified features to known metabolites/features.} + +\item{mz_tol_relative}{The m/z tolerance level for peak alignment. The default is NA, which allows the program to search for the +tolerance level based on the data. This value is expressed as the percentage of the m/z value. This value, multiplied by the m/z +value, becomes the cutoff level.} + +\item{rt_tol_relative}{The retention time tolerance level for peak alignment. The default is NA, which allows the program to search for +the tolerance level based on the data.} +} +\value{ +n x 2 matrix containing sample features-known features pairs. +} +\description{ +Match peaks from sample table to already known peaks via similar m/z and rt. +} diff --git a/man/peak_characterize.Rd b/man/peak_characterize.Rd new file mode 100644 index 00000000..f72277d0 --- /dev/null +++ b/man/peak_characterize.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/peak.characterize.R +\name{peak_characterize} +\alias{peak_characterize} +\title{Internal function: Updates the information of a feature for the known feature table.} +\usage{ +peak_characterize(existing_row = NA, metadata_row, ftrs_row, rt_row) +} +\arguments{ +\item{existing_row}{The existing row in the known feature table.} + +\item{ftrs_row}{The row of the matched feature in the new aligned feature table.} + +\item{rt_row}{The row of the matched feature in the new retention time table of aligned features.} +} +\value{ +A vector, the updated row for the known feature table. +} +\description{ +The function takes the information about the feature in the known feature table (if available), and updates it using the +information found in the current dataset. +} diff --git a/man/predict_mz_break_indices.Rd b/man/predict_mz_break_indices.Rd new file mode 100644 index 00000000..9c3f2a90 --- /dev/null +++ b/man/predict_mz_break_indices.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{predict_mz_break_indices} +\alias{predict_mz_break_indices} +\title{Predict the indices for the valley points with low mass density.} +\usage{ +predict_mz_break_indices(features, mz_orig_tol) +} +\arguments{ +\item{features}{data.frame Data table with features for which to predict the indices.} + +\item{mz_orig_tol}{float Mz tolerance to use as KDE bandwidth parameter.} +} +\value{ +vector Predicted indices for valley points. +} +\description{ +The density of mz values in the feature table is computed based on the tolerance. +The valleys or breaks of clusters in mz values are detected and a function +is approximated to predict the indices for the mass values which are the closest to those +valley points. +} diff --git a/man/predict_smoothed_rt.Rd b/man/predict_smoothed_rt.Rd new file mode 100644 index 00000000..c2dc1fd0 --- /dev/null +++ b/man/predict_smoothed_rt.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/run_filter.R +\name{predict_smoothed_rt} +\alias{predict_smoothed_rt} +\title{Computes the smoothed retention times by using The Nadaraya-Watson kernel regression estimate function.} +\usage{ +predict_smoothed_rt(min_run, times) +} +\arguments{ +\item{min_run}{Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.} + +\item{times.}{Retention times vector.} +} +\value{ +predicted rt. +} +\description{ +Computes the smoothed retention times by using The Nadaraya-Watson kernel regression estimate function. +} diff --git a/man/recetox.aplcms.Rd b/man/recetox.aplcms.Rd deleted file mode 100644 index c24ccb3a..00000000 --- a/man/recetox.aplcms.Rd +++ /dev/null @@ -1,20 +0,0 @@ -\name{recetox.aplcms} -\alias{recetox.aplcms-package} -\alias{recetox.aplcms} -\docType{package} -\title{ -Adaptive processing of LC/MS data -} -\description{ -The package generates a feature table from a batch of LC/MS spectra. It finds m/z and retention time tolerance levels from the data. A run-filter is used to detect peaks and remove noise. Non-parametric statistical methods are used to find-tune peak selection and grouping. After retention time correction, a feature table is generated by aligning peaks across spectra. -} -\author{ -Tianwei Yu -Maintainer: Tianwei Yu -} -\references{ -Bioinformatics. 25(15):1930-36. -BMC Bioinformatics. 11:559. -J. Proteome Res. 12(3):1419-27. -} -\keyword{ package } diff --git a/man/refine_selection.Rd b/man/refine_selection.Rd new file mode 100644 index 00000000..611aefe6 --- /dev/null +++ b/man/refine_selection.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recover.weaker.R +\name{refine_selection} +\alias{refine_selection} +\title{Refine the selection based on mz and rt differences.} +\usage{ +refine_selection(target_rt, rectangle, aligned_mz, rt_tol, mz_tol) +} +\arguments{ +\item{target_rt}{float Target retention time value.} + +\item{rectangle}{tibble Features with columns `rt` and `mz`.} + +\item{aligned_mz}{float Mz value in the aligned feature table of the +feature to be recovered.} + +\item{rt_tol}{float Retention time tolerance.} + +\item{mz_tol}{float Mz tolerance to use.} +} +\value{ +int Index of value in rectable closest to `target_rt` and `aligned_mz`. +} +\description{ +Refine the selection based on mz and rt differences. +} diff --git a/man/remove_noise.Rd b/man/remove_noise.Rd index 82949f31..debe3937 100644 --- a/man/remove_noise.Rd +++ b/man/remove_noise.Rd @@ -32,7 +32,7 @@ the machine's nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e- \item{baseline_correct}{After grouping the observations, the highest intensity in each group is found. If the highest is lower than this value, the entire group will be deleted.} -\item{baseline_correct_noise_percentile}{The percentile of signal strength of those EIC that don't pass the +\item{baseline_correct_noise_percentile}{Only used for plotting. The percentile of signal strength of those EIC that don't pass the run filter, to be used as the baseline threshold of signal strength.} \item{intensity_weighted}{Whether to use intensity to weight mass density estimation.} diff --git a/tests/remote-files/clusters.txt b/tests/remote-files/clusters.txt index 03f22e5a..37d5e7bd 100644 --- a/tests/remote-files/clusters.txt +++ b/tests/remote-files/clusters.txt @@ -3,4 +3,7 @@ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/RCX_08_shortened_extracted_clusters.parquet https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/RCX_06_shortened_adjusted_clusters.parquet https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/RCX_07_shortened_adjusted_clusters.parquet -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/RCX_08_shortened_adjusted_clusters.parquet \ No newline at end of file +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/RCX_08_shortened_adjusted_clusters.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/8_qc_no_dil_milliq.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/21_qc_no_dil_milliq.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/clusters/29_qc_no_dil_milliq.parquet \ No newline at end of file diff --git a/tests/remote-files/extracted-whole-extracted.txt b/tests/remote-files/extracted-whole-extracted.txt index f33a65f4..623086df 100644 --- a/tests/remote-files/extracted-whole-extracted.txt +++ b/tests/remote-files/extracted-whole-extracted.txt @@ -1,3 +1,3 @@ -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/8_qc_no_dil_milliq.parquet -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/21_qc_no_dil_milliq.parquet -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/29_qc_no_dil_milliq.parquet \ No newline at end of file +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/8_qc_no_dil_milliq.mzml.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/21_qc_no_dil_milliq.mzml.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/29_qc_no_dil_milliq.mzml.parquet \ No newline at end of file diff --git a/tests/remote-files/extracted.txt b/tests/remote-files/extracted.txt index 242c643c..91e6d4b1 100644 --- a/tests/remote-files/extracted.txt +++ b/tests/remote-files/extracted.txt @@ -1,3 +1,6 @@ https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/RCX_06_shortened.parquet https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/RCX_07_shortened.parquet -https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/RCX_08_shortened.parquet \ No newline at end of file +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/RCX_08_shortened.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/8_qc_no_dil_milliq.mzml.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/21_qc_no_dil_milliq.mzml.parquet +https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/extracted/29_qc_no_dil_milliq.mzml.parquet \ No newline at end of file diff --git a/tests/testthat/test-compute_clusters.R b/tests/testthat/test-compute_clusters.R index 1f1613dd..9449cbc8 100644 --- a/tests/testthat/test-compute_clusters.R +++ b/tests/testthat/test-compute_clusters.R @@ -51,3 +51,30 @@ patrick::with_parameters_test_that( ) ) ) + +test_that("compute clusters simple", { + testdata <- file.path("..", "testdata") + files <- c("8_qc_no_dil_milliq", "21_qc_no_dil_milliq", "29_qc_no_dil_milliq") + + extracted <- lapply(files, function(x) { + xx <- file.path(testdata, "extracted", paste0(x, ".mzml.parquet")) + tibble::as_tibble(arrow::read_parquet(xx)) + }) + actual <- compute_clusters_simple( + feature_tables = extracted, + sample_names = files, + mz_tol_ppm = 10, + rt_tol = 2 + ) + + actual <- actual[order(sapply(actual, function(x) x$sample_id[1]))] + + expected <- lapply(files, function(x) { + file <- file.path(testdata, "clusters", paste0(x, ".parquet")) + tibble::as_tibble(arrow::read_parquet(file)) + }) + + expected <- expected[order(sapply(expected, function(x) x$sample_id[1]))] + + expect_equal(as.list(actual), expected, tolerance = 0.02) +}) \ No newline at end of file diff --git a/tests/testthat/test-extract_features.R b/tests/testthat/test-extract_features.R index bcf5a69c..5989d2be 100644 --- a/tests/testthat/test-extract_features.R +++ b/tests/testthat/test-extract_features.R @@ -2,7 +2,7 @@ patrick::with_parameters_test_that( "extract single feature works", { skip_on_ci() - if (full_testdata) { + if (skip_tests) { skip("skipping whole data test case") } @@ -44,7 +44,7 @@ patrick::with_parameters_test_that( sd_cut = sd_cut, sigma_ratio_lim = sigma_ratio_lim, shape_model = "bi-Gaussian", - peak_estim_method = "moment", + peak_estim_method = peak_estim_method, component_eliminate = 0.01, moment_power = 1, BIC_factor = 2.0, @@ -69,18 +69,20 @@ patrick::with_parameters_test_that( intensity_weighted = FALSE, sd_cut = c(0.01, 500), sigma_ratio_lim = c(0.01, 100), - full_testdata = FALSE + peak_estim_method = "moment", + skip_tests = FALSE ), qc_no_dil_milliq = list( files = c("8_qc_no_dil_milliq.mzml", "21_qc_no_dil_milliq.mzml", "29_qc_no_dil_milliq.mzml"), - expected_files = c("8_qc_no_dil_milliq.parquet", "21_qc_no_dil_milliq.parquet", "29_qc_no_dil_milliq.parquet"), - mz_tol = 1e-05, - min_pres = 0.5, - min_run = 12, + expected_files = c("8_qc_no_dil_milliq.mzml.parquet", "21_qc_no_dil_milliq.mzml.parquet", "29_qc_no_dil_milliq.mzml.parquet"), + mz_tol = 5e-05, + min_pres = 0.7, + min_run = 0.5, intensity_weighted = FALSE, - sd_cut = c(0.01, 500), - sigma_ratio_lim = c(0.01, 100), - full_testdata = TRUE + sd_cut = c(0.05, 10), + sigma_ratio_lim = c(0, Inf), + peak_estim_method = "EM", + skip_tests = TRUE ) ) ) diff --git a/tests/testthat/test-find.tol.time.R b/tests/testthat/test-find.tol.time.R new file mode 100644 index 00000000..0264d585 --- /dev/null +++ b/tests/testthat/test-find.tol.time.R @@ -0,0 +1,21 @@ +test_that("compute_rt_tol_relative computes something", { + aver.bin.size <- 3 + min.bins <- 2 + max.bins <- 10 + max.num.segments <- 10 + number_of_samples <- 2 + rt <- c(0, 1, 2, 3, 3.5, 3.7, 3.9, 4, 100, 101, 101.5, 101.2, 105, 108, 108.1, 108.12, 108.3) + breaks <- c(1, 2, 3, 8, 13, 17) + + actual <- compute_rt_tol_relative( + breaks, + max.num.segments, + aver.bin.size, + number_of_samples, + rt, + min.bins, + max.bins + ) + + expect_equal(actual, 1.04167, tolerance=0.001) +}) diff --git a/tests/testthat/test-find_mz_tolerance.R b/tests/testthat/test-find_mz_tolerance.R new file mode 100644 index 00000000..1a75edc9 --- /dev/null +++ b/tests/testthat/test-find_mz_tolerance.R @@ -0,0 +1,15 @@ +test_that("mz tolerance is found", { + file <- file.path("..", "testdata", "extracted", "RCX_06_shortened.parquet") + mz <- arrow::read_parquet(file)$mz + + mz_tol_relative <- find_mz_tolerance( + mz, + mz_max_diff = 0.1, + aver.bin.size = 4000, + min.bins = 50, + max.bins = 200, + do.plot = FALSE + ) + + expect_equal(mz_tol_relative, 0.01664097, tolerance=0.001) +}) \ No newline at end of file diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R index 0df675e2..15e5a3bd 100644 --- a/tests/testthat/test-hybrid.R +++ b/tests/testthat/test-hybrid.R @@ -30,21 +30,21 @@ patrick::with_parameters_test_that("basic hybrid test", { ) if (store_reports) { - report <- dataCompareR::rCompare( - actual, - expected, - keys = keys, - roundDigits = 3, - mismatches = 100000 - ) - dataCompareR::saveReport( - report, - reportName = paste0(.test_name, "_hybrid_report"), - showInViewer = FALSE, - HTMLReport = FALSE, - mismatchCount = 10000 - ) -} + report <- dataCompareR::rCompare( + actual, + expected, + keys = keys, + roundDigits = 3, + mismatches = 100000 + ) + dataCompareR::saveReport( + report, + reportName = paste0(.test_name, "_hybrid_report"), + showInViewer = FALSE, + HTMLReport = FALSE, + mismatchCount = 10000 + ) + } expect_equal(actual, expected) },