Skip to content

Commit

Permalink
Merge pull request #201 from hechth/198_clustering
Browse files Browse the repository at this point in the history
Implemented new version for clustering
  • Loading branch information
hechth authored May 3, 2023
2 parents 0543377 + 9bda7dc commit c81061b
Show file tree
Hide file tree
Showing 61 changed files with 1,252 additions and 160 deletions.
58 changes: 28 additions & 30 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,40 @@
// https://github.com/microsoft/vscode-dev-containers/tree/v0.217.1/containers/docker-existing-dockerfile
{
"name": "recetox-aplcms-dev",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile",

"image": "ubuntu:20.04",
"features": {
"git": {
"ghcr.io/devcontainers/features/git:1": {
"version": "latest",
"ppa": false,
"ppa": false
},
"common": {
"installZsh": "false",
"username": "false",
}
"ghcr.io/devcontainers/features/common-utils:2": {
"installZsh": "false"
},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/devcontainers/features/conda:1": {}
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"reditorsupport.r",
"rdebugger.r-debugger",
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"meakbiyik.vscode-r-test-adapter",
"dvirtz.parquet-viewer",
"github.vscode-pull-request-github",
"ms-vsliveshare.vsliveshare",
"tianyishi.rmarkdown"
],
"settings": {
"r.rterm.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/radian",
"r.rpath.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/R"
"customizations": {
"vscode": {
"extensions": [
"reditorsupport.r",
"rdebugger.r-debugger",
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"meakbiyik.vscode-r-test-adapter",
"dvirtz.parquet-viewer",
"github.vscode-pull-request-github",
"ms-vsliveshare.vsliveshare",
"tianyishi.rmarkdown"
],
"settings": {
"r.rterm.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/radian",
"r.rpath.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/R"
}
}
},

"onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && git config --global --add safe.directory /workspaces/recetox-aplcms",

"onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && apt-get update -y && apt-get install -y libxml2-dev && apt-get install -y libssl-dev && apt-get install -y libcurl4-openssl-dev && apt-get install -y libcgal-dev && apt-get install -y libglu1-mesa-dev && apt-get install -y wget && git config --global --add safe.directory /workspaces/recetox-aplcms && conda init && conda update -y conda && conda config --add channels conda-forge && conda config --add channels bioconda && conda config --set channel_priority strict && conda env create --file conda/environment-dev.yaml",
"postAttachCommand": "/bin/bash"
}
}
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [dev] - unreleased
### Added
- implemented new clustering algorithm and included parallelism in unsupervised and hybrid [#201](https://github.com/RECETOX/recetox-aplcms/pull/201)
### Changed
- refactored adaptive.bin and combine.seq.3 [#196](https://github.com/RECETOX/recetox-aplcms/pull/196)
- refactored find.match [#193](https://github.com/RECETOX/recetox-aplcms/pull/193)
Expand Down
23 changes: 19 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Generated by roxygen2: do not edit by hand

S3method(solve,a)
S3method(solve,sigma)
export(adaptive.bin)
export(adjust.time)
export(aggregate_by_rt)
Expand All @@ -15,6 +13,10 @@ export(compute_breaks)
export(compute_breaks_3)
export(compute_chromatographic_profile)
export(compute_clusters)
export(compute_clusters_simple)
export(compute_comb)
export(compute_corrected_features)
export(compute_curr_rec_with_enough_peaks)
export(compute_delta_rt)
export(compute_densities)
export(compute_dx)
Expand All @@ -26,33 +28,43 @@ export(compute_mass_density)
export(compute_mass_values)
export(compute_mu_sc_std)
export(compute_mz_sd)
export(compute_peaks_and_valleys)
export(compute_pks_vlys_rt)
export(compute_rectangle)
export(compute_rt_intervals_indices)
export(compute_scale)
export(compute_sel)
export(compute_start_bound)
export(compute_target_times)
export(compute_template)
export(compute_template_adjusted_rt)
export(compute_uniq_grp)
export(correct_time)
export(count_peaks)
export(create_aligned_feature_table)
export(create_output)
export(create_rows)
export(draw_rt_correction_plot)
export(draw_rt_normal_peaks)
export(duplicate.row.remove)
export(fill_missing_values)
export(filter_based_on_density)
export(find.tol)
export(find.turn.point)
export(find_local_maxima)
export(find_mz_match)
export(find_mz_tolerance)
export(find_optima)
export(get_custom_rt_tol)
export(get_features_in_rt_range)
export(get_mzrange_bound_indices)
export(get_num_workers)
export(get_rt_region_indices)
export(get_single_occurrence_mask)
export(get_times_to_use)
export(hybrid)
export(increment_counter)
export(interpol.area)
export(l2normalize)
export(label_val_to_keep)
export(load.lcms)
export(load_aligned_features)
Expand All @@ -68,18 +80,21 @@ export(plot_raw_profile_histogram)
export(plot_rt_profile)
export(predict_mz_break_indices)
export(predict_smoothed_rt)
export(prep.uv)
export(prep_uv)
export(preprocess_bandwidth)
export(preprocess_profile)
export(prof.to.features)
export(recover.weaker)
export(refine_selection)
export(remove_noise)
export(rev_cum_sum)
export(rm.ridge)
export(run_filter)
export(select_mz)
export(select_rt)
export(semi.sup)
export(solve_a)
export(solve_sigma)
export(sort_data)
export(span)
export(two.step.hybrid)
Expand Down
5 changes: 5 additions & 0 deletions R/adjust.time.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
NULL
#> NULL

#' @export
compute_comb <- function(template_features, features) {
combined <- dplyr::bind_rows(
template_features,
Expand All @@ -11,6 +12,7 @@ compute_comb <- function(template_features, features) {
return(combined)
}

#' @export
compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
l <- nrow(combined)
sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] <
Expand All @@ -20,6 +22,7 @@ compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
return(sel)
}

#' @export
compute_template_adjusted_rt <- function(combined, sel, j) {
all_features <- cbind(combined$rt[sel], combined$rt[sel + 1])
flip_indices <- which(combined$sample_id[sel] == j)
Expand All @@ -34,6 +37,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
return(all_features)
}

#' @export
compute_corrected_features <- function(features, delta_rt, avg_time) {
features <- features[order(features$rt, features$mz), ]
corrected <- features$rt
Expand All @@ -58,6 +62,7 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
return(features)
}

#' @export
fill_missing_values <- function(orig.feature, this.feature) {
missing_values <- which(is.na(this.feature$rt))
for (i in missing_values) {
Expand Down
115 changes: 91 additions & 24 deletions R/compute_clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
#' Sort feature tables based on sample names
#'
#' @description
#' Reorders the list of feature tables so that their order matches
#' `sample_names`. Each table is keyed by the `sample_id` of its first row.
#' @param sample_names character vector Sample names defining the target order.
#' @param feature_tables list of tibbles Feature tables, each carrying a
#'   `sample_id` column with a single sample's identifier.
#' @return list of tibbles The feature tables reordered to match `sample_names`.
#' @export
sort_data <- function(sample_names, feature_tables) {
  # Key each table by the sample_id of its first row.
  # lapply + unlist avoids growing a vector with append() inside a loop
  # and preserves whatever type sample_id has.
  index <- unlist(lapply(feature_tables, function(table) table$sample_id[1]))

  # Reorder the tables so position i holds the table for sample_names[i].
  feature_tables[match(sample_names, index)]
}

#' Compute clusters of mz and rt and assign cluster id to individual features.
#'
#'
#' @description
#' Uses tolerances to group features with mz and rt within the tolerance into clusters,
#' creating larger features from raw data points. Custom tolerances for mz and rt are
Expand All @@ -34,7 +34,7 @@ sort_data <- function(sample_names, feature_tables){
#' \item feature_tables - list - Feature tables with added columns [sample_id, cluster].
#' \item rt_tol_relative - float - Newly determined relative rt tolerance.
#' \item mz_tol_relative - float - Newly determined relative mz tolerance.
#'}
#' }
#' @export
compute_clusters <- function(feature_tables,
mz_tol_relative,
Expand All @@ -45,9 +45,9 @@ compute_clusters <- function(feature_tables,
sample_names = NA) {
number_of_samples <- length(feature_tables)
all <- concatenate_feature_tables(feature_tables, sample_names)

if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(
mz_tol_relative <- find_mz_tolerance(
all$mz,
mz_max_diff = mz_max_diff,
aver.bin.size = 4000,
Expand All @@ -74,37 +74,104 @@ compute_clusters <- function(feature_tables,
)
}

res <- find.tol.time(
all,
number_of_samples = number_of_samples,
mz_tol_relative = mz_tol_relative,
rt_tol_relative = rt_tol_relative,
aver.bin.size = 200,
min.bins = 50,
max.bins = 100,
mz_tol_absolute = mz_tol_absolute,
max.num.segments = 10000,
do.plot = do.plot
aver.bin.size <- 200
min.bins <- 50
max.bins <- 100
max.num.segments <- 10000

features <- dplyr::arrange_at(all, "mz")
min_mz_tol <- compute_min_mz_tolerance(
features$mz,
mz_tol_relative,
mz_tol_absolute
)

rt_tol_relative <- res$rt.tol
mz_breaks <- compute_breaks_3(features$mz, min_mz_tol)
features$mz_group <- 0

for (i in 2:length(mz_breaks)) {
subset_indices <- (mz_breaks[i - 1] + 1):mz_breaks[i]
features$mz_group[subset_indices] <- i
}

features <- features |> dplyr::arrange_at(c("mz_group", "rt"))

mz_breaks <- mz_breaks[c(-1, -length(mz_breaks))]

if (is.na(rt_tol_relative)) {
rt_tol_relative <- compute_rt_tol_relative(
mz_breaks,
max.num.segments,
aver.bin.size,
number_of_samples,
features$rt,
min.bins,
max.bins
)
}

# compute breaks in rt domain
rt_diffs <- diff(features$rt)
rt_breaks <- which(rt_diffs > rt_tol_relative)

# combine indices of all breaks in array and sort
all.breaks <- c(0, unique(c(mz_breaks, rt_breaks)), nrow(features))
all.breaks <- all.breaks[order(all.breaks)]

features$cluster <- 0
for (i in 2:length(all.breaks)) {
features$cluster[(all.breaks[i - 1] + 1):all.breaks[i]] <- i
}

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
message(paste("time tolerance level:", rt_tol_relative))

# Select features from individual samples, sort by mz and rt and
# Select features from individual samples, sort by mz and rt and
# return the sorted tables as individual tibbles.
feature_tables <- res$features |>
feature_tables <- features |>
dplyr::select(-mz_group) |>
dplyr::group_by(sample_id) |>
dplyr::arrange_at(c("mz", "rt")) |>
dplyr::group_split()

feature_tables <- sort_data(sample_names, feature_tables)

return(list(feature_tables = feature_tables, rt_tol_relative = rt_tol_relative, mz_tol_relative = mz_tol_relative))
}

#' Compute clusters using simple grouping based on numeric thresholds.
#'
#' @description
#' Features are first grouped in mz dimension based on the tolerance.
#' First, the absolute tolerance is computed for each feature, then a new group is started
#' once the difference between consecutive features is above this threshold.
#' The same process is then repeated for the retention time dimension.
#' The individual indices are then combined into a single index in the `cluster` column.
#' @param feature_tables list of tibbles feature tables coming from all samples.
#' @param sample_names list of strings Sample names of the feature tables used to distinguish the samples.
#' @param mz_tol_ppm float Relative tolerance for mz grouping in parts per million.
#' @param rt_tol float Tolerance in retention time dimension [seconds].
#' @return list of tibbles Feature tables passed initially with additional columns indicating the
#' mz and rt groups as well as the combined cluster index.
#' @export
compute_clusters_simple <- function(feature_tables, sample_names, mz_tol_ppm, rt_tol) {
  all <- concatenate_feature_tables(feature_tables, sample_names) |> dplyr::arrange_at("mz")

  # Convert ppm to a relative fraction, then to a per-feature absolute mz tolerance.
  mz_tol_rel <- mz_tol_ppm * 1e-06
  mz_tol_abs <- all$mz * mz_tol_rel

  all |>
    # Start a new mz group whenever the gap to the previous feature exceeds
    # that feature's absolute tolerance (first feature always opens group 0).
    dplyr::mutate(mz_group = cumsum(c(0, diff(mz)) > mz_tol_abs)) |>
    dplyr::group_by(mz_group) |>
    dplyr::arrange_at("rt") |>
    # Within each mz group, start a new rt group on gaps larger than rt_tol.
    dplyr::mutate(rt_group = cumsum(c(0, diff(rt)) > rt_tol)) |>
    dplyr::group_by(mz_group, rt_group) |>
    # Collapse the (mz_group, rt_group) pair into a single cluster index.
    # Explicit dplyr:: prefix keeps the call resolvable in package context,
    # consistent with every other dplyr call in this function.
    dplyr::mutate(cluster = dplyr::cur_group_id()) |>
    dplyr::ungroup() |>
    dplyr::arrange_at("cluster") |>
    dplyr::group_by(sample_id) |>
    dplyr::group_split()
}


# compute_clusters_v2 <- function(feature_tables, mz_tol_ppm, rt_tol) {
Expand Down
8 changes: 5 additions & 3 deletions R/feature.align.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,13 @@ create_aligned_feature_table <- function(features_table,
if (!is(cluster, "cluster")) {
cluster <- parallel::makeCluster(cluster)
on.exit(parallel::stopCluster(cluster))

# NOTE: side effect (doParallel has no functionality to clean up)
doParallel::registerDoParallel(cluster)
register_functions_to_cluster(cluster)
}

# NOTE: side effect (doParallel has no functionality to clean up)
doParallel::registerDoParallel(cluster)
register_functions_to_cluster(cluster)


number_of_samples <- length(sample_names)
metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
Expand Down
Loading

0 comments on commit c81061b

Please sign in to comment.