Implemented new version for clustering #201

Merged 36 commits on May 3, 2023

Changes from all commits (36 commits)
a14b5b8
updated devcontainer
hechth Apr 19, 2023
0feb630
updated devcontainer
hechth Apr 19, 2023
648c216
updated devcontainer
hechth Apr 19, 2023
7111569
unpacked find.tol function
hechth Apr 19, 2023
0145536
Merge branch 'master' into 198_clustering
hechth Apr 19, 2023
5789de6
Merge remote-tracking branch 'origin/62_devcontainer' into 198_cluste…
hechth Apr 19, 2023
0acb9ea
updated paths and added container and remote user
hechth Apr 19, 2023
e301373
Implemented new method and added unit test
hechth Apr 19, 2023
fcdd904
renamed find.tol to find_mz_tolerance and implemented unit test
hechth Apr 19, 2023
c88677c
updated test to extract features
hechth Apr 21, 2023
5bf6b55
updated documentation
hechth Apr 21, 2023
bb11c07
removed username section from devcontainer
hechth Apr 21, 2023
7ac877d
moved function to compute min_mz_tolerance
hechth Apr 21, 2023
9c42b46
updated compute clusters test with actual assertion
hechth Apr 21, 2023
1152424
added radian to initialization
hechth Apr 25, 2023
4bc3735
Removed outdocumented code and added docstring
hechth Apr 25, 2023
4954a96
updated testdata pulling
hechth Apr 25, 2023
c970a42
added test stub to compute time tolerance
hechth Apr 25, 2023
891d2fe
implemented test to find time tolerance
hechth Apr 25, 2023
cec9f8a
removed all documentation files
hechth Apr 25, 2023
9da0cb7
Merge branch 'RECETOX:master' into 198_clustering
hechth Apr 25, 2023
ef50d69
added all extracted files to normal extracted testdata
hechth Apr 27, 2023
0327824
small bugfix in devcontainer
hechth Apr 27, 2023
7fcf67b
Merge branch 'master' into 198_clustering
hechth Apr 27, 2023
782c86b
fixed parallel execution in hybrid and unsupervised
hechth Apr 28, 2023
2ac6a87
fixed warnings
hechth Apr 28, 2023
5a6543d
fixed indentation
hechth Apr 28, 2023
8c2fbae
Merge branch '198_clustering' of github.com:hechth/recetox-aplcms int…
hechth Apr 28, 2023
58789ba
updated changelog
hechth Apr 28, 2023
1a1bf28
Update R/compute_clusters.R
hechth May 2, 2023
44defc1
Update tests/testthat/test-find.tol.time.R
hechth May 2, 2023
6d6ed95
Merge branch 'master' into 198_clustering
hechth May 2, 2023
60021c0
re-added documentation
hechth May 3, 2023
0f0bae4
Merge branch '198_clustering' of github.com:hechth/recetox-aplcms int…
hechth May 3, 2023
b5a61ce
fixed export statements for functions
hechth May 3, 2023
9bda7dc
Update R/compute_clusters.R
hechth May 3, 2023
58 changes: 28 additions & 30 deletions .devcontainer/devcontainer.json
@@ -2,42 +2,40 @@
// https://github.com/microsoft/vscode-dev-containers/tree/v0.217.1/containers/docker-existing-dockerfile
{
"name": "recetox-aplcms-dev",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile",

"image": "ubuntu:20.04",
"features": {
"git": {
"ghcr.io/devcontainers/features/git:1": {
"version": "latest",
"ppa": false,
"ppa": false
},
"common": {
"installZsh": "false",
"username": "false",
}
"ghcr.io/devcontainers/features/common-utils:2": {
"installZsh": "false"
},
"ghcr.io/devcontainers/features/github-cli:1": {},
"ghcr.io/devcontainers/features/conda:1": {}
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"reditorsupport.r",
"rdebugger.r-debugger",
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"meakbiyik.vscode-r-test-adapter",
"dvirtz.parquet-viewer",
"github.vscode-pull-request-github",
"ms-vsliveshare.vsliveshare",
"tianyishi.rmarkdown"
],
"settings": {
"r.rterm.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/radian",
"r.rpath.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/R"
"customizations": {
"vscode": {
"extensions": [
"reditorsupport.r",
"rdebugger.r-debugger",
"eamodio.gitlens",
"mutantdino.resourcemonitor",
"meakbiyik.vscode-r-test-adapter",
"dvirtz.parquet-viewer",
"github.vscode-pull-request-github",
"ms-vsliveshare.vsliveshare",
"tianyishi.rmarkdown"
],
"settings": {
"r.rterm.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/radian",
"r.rpath.linux": "/opt/conda/envs/recetox-aplcms-dev/bin/R"
}
}
},

"onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && git config --global --add safe.directory /workspaces/recetox-aplcms",

"onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && apt-get update -y && apt-get install -y libxml2-dev && apt-get install -y libssl-dev && apt-get install -y libcurl4-openssl-dev && apt-get install -y libcgal-dev && apt-get install -y libglu1-mesa-dev && apt-get install -y wget && git config --global --add safe.directory /workspaces/recetox-aplcms && conda init && conda update -y conda && conda config --add channels conda-forge && conda config --add channels bioconda && conda config --set channel_priority strict && conda env create --file conda/environment-dev.yaml",
"postAttachCommand": "/bin/bash"
}
}
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [dev] - unreleased
### Added
- implemented new clustering algorithm and included parallelism in unsupervised and hybrid [#201](https://github.com/RECETOX/recetox-aplcms/pull/201)
### Changed
- refactored adaptive.bin and combine.seq.3 [#196](https://github.com/RECETOX/recetox-aplcms/pull/196)
- refactored find.match [#193](https://github.com/RECETOX/recetox-aplcms/pull/193)
23 changes: 19 additions & 4 deletions NAMESPACE
@@ -1,7 +1,5 @@
# Generated by roxygen2: do not edit by hand

S3method(solve,a)
S3method(solve,sigma)
export(adaptive.bin)
export(adjust.time)
export(aggregate_by_rt)
@@ -15,6 +13,10 @@ export(compute_breaks)
export(compute_breaks_3)
export(compute_chromatographic_profile)
export(compute_clusters)
export(compute_clusters_simple)
export(compute_comb)
export(compute_corrected_features)
export(compute_curr_rec_with_enough_peaks)
export(compute_delta_rt)
export(compute_densities)
export(compute_dx)
@@ -26,33 +28,43 @@ export(compute_mass_density)
export(compute_mass_values)
export(compute_mu_sc_std)
export(compute_mz_sd)
export(compute_peaks_and_valleys)
export(compute_pks_vlys_rt)
export(compute_rectangle)
export(compute_rt_intervals_indices)
export(compute_scale)
export(compute_sel)
export(compute_start_bound)
export(compute_target_times)
export(compute_template)
export(compute_template_adjusted_rt)
export(compute_uniq_grp)
export(correct_time)
export(count_peaks)
export(create_aligned_feature_table)
export(create_output)
export(create_rows)
export(draw_rt_correction_plot)
export(draw_rt_normal_peaks)
export(duplicate.row.remove)
export(fill_missing_values)
export(filter_based_on_density)
export(find.tol)
export(find.turn.point)
export(find_local_maxima)
export(find_mz_match)
export(find_mz_tolerance)
export(find_optima)
export(get_custom_rt_tol)
export(get_features_in_rt_range)
export(get_mzrange_bound_indices)
export(get_num_workers)
export(get_rt_region_indices)
export(get_single_occurrence_mask)
export(get_times_to_use)
export(hybrid)
export(increment_counter)
export(interpol.area)
export(l2normalize)
export(label_val_to_keep)
export(load.lcms)
export(load_aligned_features)
@@ -68,18 +80,21 @@ export(plot_raw_profile_histogram)
export(plot_rt_profile)
export(predict_mz_break_indices)
export(predict_smoothed_rt)
export(prep.uv)
export(prep_uv)
export(preprocess_bandwidth)
export(preprocess_profile)
export(prof.to.features)
export(recover.weaker)
export(refine_selection)
export(remove_noise)
export(rev_cum_sum)
export(rm.ridge)
export(run_filter)
export(select_mz)
export(select_rt)
export(semi.sup)
export(solve_a)
export(solve_sigma)
export(sort_data)
export(span)
export(two.step.hybrid)
5 changes: 5 additions & 0 deletions R/adjust.time.R
@@ -2,6 +2,7 @@
NULL
#> NULL

#' @export
compute_comb <- function(template_features, features) {
combined <- dplyr::bind_rows(
template_features,
@@ -11,6 +12,7 @@ compute_comb <- function(template_features, features) {
return(combined)
}

#' @export
compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
l <- nrow(combined)
sel <- which(combined$mz[2:l] - combined$mz[1:(l - 1)] <
@@ -20,6 +22,7 @@ compute_sel <- function(combined, mz_tol_relative, rt_tol_relative) {
return(sel)
}

#' @export
compute_template_adjusted_rt <- function(combined, sel, j) {
all_features <- cbind(combined$rt[sel], combined$rt[sel + 1])
flip_indices <- which(combined$sample_id[sel] == j)
@@ -34,6 +37,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
return(all_features)
}

#' @export
compute_corrected_features <- function(features, delta_rt, avg_time) {
features <- features[order(features$rt, features$mz), ]
corrected <- features$rt
@@ -58,6 +62,7 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
return(features)
}

#' @export
fill_missing_values <- function(orig.feature, this.feature) {
missing_values <- which(is.na(this.feature$rt))
for (i in missing_values) {
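Side note on the `#' @export` tags added above: roxygen2 turns them into the new `export()` entries visible in the NAMESPACE diff earlier in this PR. A minimal way to regenerate that file after tagging functions (assuming devtools is installed; plain roxygen2 works as well):

```r
# Rebuild NAMESPACE and the man/ pages from the roxygen comments.
devtools::document()
# Equivalent without devtools:
# roxygen2::roxygenise()
```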
115 changes: 91 additions & 24 deletions R/compute_clusters.R
@@ -2,21 +2,21 @@
#' @description
#' Sort tibble based on sample_names
#' @export
sort_data <- function(sample_names, feature_tables){
sort_data <- function(sample_names, feature_tables) {
index <- c()
for (i in seq_along(sample_names))
{
index <- append(index, feature_tables[[i]]$sample_id[1])
index <- append(index, feature_tables[[i]]$sample_id[1])
}

index <- match(sample_names, index)
feature_tables <- feature_tables[index]

return(feature_tables)
}

#' Compute clusters of mz and rt and assign cluster id to individual features.
#'
#'
#' @description
#' Uses tolerances to group features with mz and rt within the tolerance into clusters,
#' creating larger features from raw data points. Custom tolerances for mz and rt are
@@ -34,7 +34,7 @@ sort_data <- function(sample_names, feature_tables){
#' \item feature_tables - list - Feature tables with added columns [sample_id, cluster].
#' \item rt_tol_relative - float - Newly determined relative rt tolerance.
#' \item mz_tol_relative - float - Newly determined relative mz tolerance.
#'}
#' }
#' @export
compute_clusters <- function(feature_tables,
mz_tol_relative,
@@ -45,9 +45,9 @@ compute_clusters <- function(feature_tables,
sample_names = NA) {
number_of_samples <- length(feature_tables)
all <- concatenate_feature_tables(feature_tables, sample_names)

if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(
mz_tol_relative <- find_mz_tolerance(
all$mz,
mz_max_diff = mz_max_diff,
aver.bin.size = 4000,
@@ -74,37 +74,104 @@
)
}

res <- find.tol.time(
all,
number_of_samples = number_of_samples,
mz_tol_relative = mz_tol_relative,
rt_tol_relative = rt_tol_relative,
aver.bin.size = 200,
min.bins = 50,
max.bins = 100,
mz_tol_absolute = mz_tol_absolute,
max.num.segments = 10000,
do.plot = do.plot
aver.bin.size <- 200
min.bins <- 50
max.bins <- 100
max.num.segments <- 10000

features <- dplyr::arrange_at(all, "mz")
min_mz_tol <- compute_min_mz_tolerance(
features$mz,
mz_tol_relative,
mz_tol_absolute
)

rt_tol_relative <- res$rt.tol
mz_breaks <- compute_breaks_3(features$mz, min_mz_tol)
features$mz_group <- 0

for (i in 2:length(mz_breaks)) {
subset_indices <- (mz_breaks[i - 1] + 1):mz_breaks[i]
features$mz_group[subset_indices] <- i
}

features <- features |> dplyr::arrange_at(c("mz_group", "rt"))

mz_breaks <- mz_breaks[c(-1, -length(mz_breaks))]

if (is.na(rt_tol_relative)) {
rt_tol_relative <- compute_rt_tol_relative(
mz_breaks,
max.num.segments,
aver.bin.size,
number_of_samples,
features$rt,
min.bins,
max.bins
)
}

# compute breaks in rt domain
rt_diffs <- diff(features$rt)
rt_breaks <- which(rt_diffs > rt_tol_relative)

# combine indices of all breaks in array and sort
all.breaks <- c(0, unique(c(mz_breaks, rt_breaks)), nrow(features))
all.breaks <- all.breaks[order(all.breaks)]

features$cluster <- 0
for (i in 2:length(all.breaks)) {
features$cluster[(all.breaks[i - 1] + 1):all.breaks[i]] <- i
}

message("**** performing time correction ****")
message(paste("m/z tolerance level: ", mz_tol_relative))
message(paste("time tolerance level:", rt_tol_relative))

# Select features from individual samples, sort by mz and rt and
# Select features from individual samples, sort by mz and rt and
# return the sorted tables as individual tibbles.
feature_tables <- res$features |>
feature_tables <- features |>
dplyr::select(-mz_group) |>
dplyr::group_by(sample_id) |>
dplyr::arrange_at(c("mz", "rt")) |>
dplyr::group_split()

feature_tables <- sort_data(sample_names, feature_tables)

return(list(feature_tables = feature_tables, rt_tol_relative = rt_tol_relative, mz_tol_relative = mz_tol_relative))
}
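
For illustration only, here is a standalone sketch of the break-based grouping used in `compute_clusters` above. The toy feature table and both tolerance values are invented for the example; only dplyr is assumed, and the real function additionally estimates the tolerances (via `find_mz_tolerance` and `compute_rt_tol_relative`) when they are passed as NA.

```r
library(dplyr)

# Toy feature table; values and tolerances are arbitrary.
features <- tibble(
  mz = c(100.000, 100.001, 100.050, 200.000, 200.002),
  rt = c(10.0, 10.2, 55.0, 30.0, 30.1)
) |> arrange(mz)

mz_tol <- 0.01  # hypothetical absolute m/z tolerance
rt_tol <- 5.0   # hypothetical rt tolerance in seconds

# m/z groups: a new group starts wherever the gap between consecutive
# sorted m/z values exceeds the tolerance.
features$mz_group <- cumsum(c(0, diff(features$mz)) > mz_tol)

# Re-sort within m/z groups by rt, then collect break positions in both dimensions.
features <- features |> arrange(mz_group, rt)
mz_breaks <- which(diff(features$mz_group) != 0)
rt_breaks <- which(diff(features$rt) > rt_tol)

# Every break in either dimension closes a cluster.
all_breaks <- sort(unique(c(0, mz_breaks, rt_breaks, nrow(features))))
features$cluster <- 0
for (i in 2:length(all_breaks)) {
  features$cluster[(all_breaks[i - 1] + 1):all_breaks[i]] <- i
}
features  # rows 1-2, row 3, and rows 4-5 fall into three separate clusters
```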

#' Compute clusters using simple grouping based on numeric thresholds.
#'
#' @description
#' Features are first grouped in mz dimension based on the tolerance.
#' First, the absolute tolerance is computed for each feature, then a new group is started
#' once the difference between consecutive features is above this threshold.
#' The same process is then repeated for the retention time dimension.
#' The individual indices are then combined into a single index in the `cluster` column.
#' @param feature_tables list of tibbles feature tables coming from all samples.
#' @param sample_names list of strings Sample names of the feature tables used to distinguish the samples.
#' @param mz_tol_ppm float Relative tolerance for mz grouping in parts per million.
#' @param rt_tol float Tolerance in retention time dimension [seconds].
#' @return list of tibbles Feature tables passed initially with additional columns indicating the
#' mz and rt groups as well as the combined cluster index.
#' @export
compute_clusters_simple <- function(feature_tables, sample_names, mz_tol_ppm, rt_tol) {
all <- concatenate_feature_tables(feature_tables, sample_names) |> dplyr::arrange_at("mz")

mz_tol_rel <- mz_tol_ppm * 1e-06
mz_tol_abs <- all$mz * mz_tol_rel

all |>
dplyr::mutate(mz_group = cumsum(c(0, diff(mz)) > mz_tol_abs)) |>
dplyr::group_by(mz_group) |>
dplyr::arrange_at("rt") |>
dplyr::mutate(rt_group = cumsum(c(0, diff(rt)) > rt_tol)) |>
dplyr::group_by(mz_group, rt_group) |>
dplyr::mutate(cluster = cur_group_id()) |>
dplyr::ungroup() |>
dplyr::arrange_at("cluster") |>
dplyr::group_by(sample_id) |>
dplyr::group_split()
}
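
And a self-contained toy re-creation of the grouping that `compute_clusters_simple` describes, without calling into the package; the input values and tolerances below are made up:

```r
library(dplyr)

toy <- tibble(
  mz        = c(100.0000, 100.0005, 100.2000, 100.2003),
  rt        = c(12.0, 12.3, 40.0, 80.0),
  sample_id = c("a", "b", "a", "b")
)

mz_tol_ppm <- 10  # hypothetical relative m/z tolerance in ppm
rt_tol     <- 2   # hypothetical rt tolerance in seconds

toy |>
  arrange(mz) |>
  # per-feature absolute tolerance derived from the ppm value
  mutate(mz_group = cumsum(c(0, diff(mz)) > mz * mz_tol_ppm * 1e-06)) |>
  group_by(mz_group) |>
  arrange(rt, .by_group = TRUE) |>
  mutate(rt_group = cumsum(c(0, diff(rt)) > rt_tol)) |>
  group_by(mz_group, rt_group) |>
  mutate(cluster = cur_group_id()) |>
  ungroup()
```

The exported function then splits the result back into per-sample tibbles via `group_by(sample_id)` and `group_split()`.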


# compute_clusters_v2 <- function(feature_tables, mz_tol_ppm, rt_tol) {
8 changes: 5 additions & 3 deletions R/feature.align.R
@@ -134,11 +134,13 @@ create_aligned_feature_table <- function(features_table,
if (!is(cluster, "cluster")) {
cluster <- parallel::makeCluster(cluster)
on.exit(parallel::stopCluster(cluster))

# NOTE: side effect (doParallel has no functionality to clean up)
doParallel::registerDoParallel(cluster)
register_functions_to_cluster(cluster)
}

# NOTE: side effect (doParallel has no functionality to clean up)
doParallel::registerDoParallel(cluster)
register_functions_to_cluster(cluster)


number_of_samples <- length(sample_names)
metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
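
The hunk above moves `registerDoParallel` and `register_functions_to_cluster` out of the `if` branch so registration also happens when a ready-made cluster object is passed in. A standalone sketch of that pattern (not package code; the squared-numbers task is invented):

```r
library(parallel)
library(doParallel)
library(foreach)

# Accept either a worker count or an existing cluster object;
# register the doParallel backend in both cases.
run_squares <- function(cluster = 2) {
  if (!inherits(cluster, "cluster")) {
    cluster <- parallel::makeCluster(cluster)
    on.exit(parallel::stopCluster(cluster))
  }
  # NOTE: registering the backend is a global side effect;
  # doParallel provides no dedicated cleanup call.
  doParallel::registerDoParallel(cluster)

  foreach::foreach(i = 1:4, .combine = c) %dopar% i^2
}

run_squares()  # creates (and stops) its own 2-worker cluster
```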