Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use proper sample IDs inside feature tables #153

Merged
merged 14 commits into from
Oct 25, 2022
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
- use proper sample IDs inside feature tables [#153](https://github.com/RECETOX/recetox-aplcms/pull/153)
### Removed

## [0.9.4] - 2022-05-10
Expand Down
12 changes: 4 additions & 8 deletions R/adjust.time.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) {

# now the first column is the template retention time.
# the second column is the to-be-adjusted retention time

cat(c("sample", j, "using", nrow(all_features), ", "))
if (j %% 3 == 0) {
cat("\n")
}


all_features <- all_features[order(all_features[, 2]), ]
return(all_features)
}
Expand Down Expand Up @@ -77,10 +72,11 @@ fill_missing_values <- function(orig.feature, this.feature) {

compute_template <- function(extracted_features) {
num.ftrs <- sapply(extracted_features, nrow)
template <- which.max(num.ftrs)
template_id <- which.max(num.ftrs)
template <- extracted_features[[template_id]]$sample_id[1]
message(paste("the template is sample", template))

candi <- tibble::as_tibble(extracted_features[[template]]) |> dplyr::select(c(mz, rt))
candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt))
template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
return(tibble::as_tibble(template_features))
}
Expand Down
8 changes: 5 additions & 3 deletions R/compute_clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#' @param mz_max_diff float Maximum difference between featuure mz values to belong to the same cluster.
#' @param rt_tol_relative float Relative retention time tolerance to use for grouping features.
#' @param do.plot bool Plot graphics or not.
#' @param sample_names list List of sample names.
#' @return Returns a list with following items:
#' \itemize{
#' \item feature_tables - list - Feature tables with added columns [sample_id, cluster].
Expand All @@ -23,10 +24,11 @@ compute_clusters <- function(feature_tables,
mz_tol_absolute,
mz_max_diff,
rt_tol_relative,
do.plot = FALSE) {
do.plot = FALSE,
sample_names = NA) {
number_of_samples <- length(feature_tables)
all <- concatenate_feature_tables(feature_tables)

all <- concatenate_feature_tables(feature_tables, sample_names)
if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(
all$mz,
Expand Down
47 changes: 26 additions & 21 deletions R/feature.align.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ add_row <- function(df, data, i, column_names) {
}


create_output <- function(sample_grouped, number_of_samples) {
create_output <- function(sample_grouped, sample_names) {
number_of_samples <- length(sample_names)
intensity_row <- rep(0, number_of_samples)
rt_row <- rep(0, number_of_samples)
sample_presence <- rep(0, number_of_samples)

for (i in seq_along(intensity_row)) {
filtered <- filter(sample_grouped, sample_id == i)
filtered <- filter(sample_grouped, sample_id == sample_names[i])
if (nrow(filtered) != 0) {
sample_presence[i] <- 1
intensity_row[i] <- sum(filtered$area)
Expand Down Expand Up @@ -68,23 +69,23 @@ filter_based_on_density <- function(sample, turns, index, i) {
}


select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
for (i in seq_along(turns$peaks)) {
sample_grouped <- filter_based_on_density(sample, turns, 2, i)
if (validate_contents(sample_grouped, min_occurrence)) {
return(create_output(sample_grouped, number_of_samples))
return(create_output(sample_grouped, sample_names))
}
}
}


select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
for (i in seq_along(turns$peaks)) {
sample_grouped <- filter_based_on_density(sample, turns, 1, i)
if (validate_contents(sample_grouped, min_occurrence)) {
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
}
}
}
Expand All @@ -96,32 +97,33 @@ create_rows <- function(features,
mz_tol_relative,
rt_tol_relative,
min_occurrence,
number_of_samples) {
sample_names) {
if (i %% 100 == 0) {
gc()
} # call Garbage Collection for performance improvement?

sample <- dplyr::filter(features, cluster == sel.labels[i])
if (nrow(sample) > 1) {
if (validate_contents(sample, min_occurrence)) {
return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
}
} else if (min_occurrence == 1) {
return(create_output(sample_grouped, number_of_samples))
return(create_output(sample_grouped, sample_names))
}
return(NULL)
}


create_aligned_feature_table <- function(all_table,
min_occurrence,
number_of_samples,
sample_names,
rt_tol_relative,
mz_tol_relative) {

metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", paste0("sample_", 1:number_of_samples))
intensity_colnames <- c("id", paste0("sample_", 1:number_of_samples, "_intensity"))
rt_colnames <- c("id", paste0("sample_", 1:number_of_samples, "_rt"))
number_of_samples <- length(sample_names)
metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
intensity_colnames <- c("id", paste0(sample_names, "_intensity"))
rt_colnames <- c("id", paste0(sample_names, "_rt"))

aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)

Expand All @@ -139,7 +141,7 @@ create_aligned_feature_table <- function(all_table,
mz_tol_relative,
rt_tol_relative,
min_occurrence,
number_of_samples
sample_names
)

if (!is.null(rows)) {
Expand Down Expand Up @@ -168,6 +170,7 @@ create_aligned_feature_table <- function(all_table,
#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly
#' influences feature matching in higher m/z range.
#' @param do.plot Indicates whether plot should be drawn.
#' @param sample_names list List of sample names.
#' @return Returns a list object with the following objects in it:
#' \itemize{
#' \item aligned.ftrs - A matrix, with columns of m/z values, elution times, signal strengths in each spectrum.
Expand All @@ -178,37 +181,39 @@ create_aligned_feature_table <- function(all_table,
#' @export
#' @examples
#' data(extracted)
#' feature.align(extracted, mz_max_diff = 10 * 1e-05, do.plot = FALSE)
#' feature.align(extracted, mz_max_diff = 10 * 1e-05, do.plot = FALSE, sample_names = c("s1", "s2", "s3"))
feature.align <- function(features,
min_occurrence = 2,
mz_tol_relative = NA,
rt_tol_relative = NA,
mz_max_diff = 1e-4,
mz_tol_absolute = 0.01,
do.plot = TRUE) {
do.plot = TRUE,
sample_names = NA) {
if (do.plot) {
par(mfrow = c(3, 2))
draw_plot(label = "Feature alignment", cex = 2)
draw_plot()
}



number_of_samples <- length(features)
if (number_of_samples > 1) {
res <- compute_clusters(
features,
mz_tol_relative,
mz_tol_absolute,
mz_max_diff,
rt_tol_relative
rt_tol_relative,
do.plot,
sample_names
)

all_table <- dplyr::bind_rows(res$feature_tables)

aligned_features <- create_aligned_feature_table(
all_table,
min_occurrence,
number_of_samples,
sample_names,
res$rt_tol_relative,
res$mz_tol_relative
)
Expand Down
7 changes: 4 additions & 3 deletions R/hybrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ hybrid <- function(
mz_tol_relative = align_mz_tol,
mz_tol_absolute = max_align_mz_diff,
mz_max_diff = 10 * mz_tol,
rt_tol_relative = align_rt_tol
rt_tol_relative = align_rt_tol,
sample_names = sample_names
)

message("**** computing template ****")
Expand Down Expand Up @@ -318,7 +319,7 @@ hybrid <- function(
aligned <- create_aligned_feature_table(
dplyr::bind_rows(adjusted_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
adjusted_clusters$rt_tol_relative,
adjusted_clusters$mz_tol_relative
)
Expand Down Expand Up @@ -392,7 +393,7 @@ hybrid <- function(
recovered_aligned <- create_aligned_feature_table(
dplyr::bind_rows(adjusted_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
adjusted_clusters$rt_tol_relative,
adjusted_clusters$mz_tol_relative
)
Expand Down
3 changes: 1 addition & 2 deletions R/recover.weaker.R
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,6 @@ recover.weaker <- function(filename,
bandwidth = .5,
recover.min.count = 3,
intensity.weighted = FALSE) {

# load raw data
data_table <- load_file(filename) |> dplyr::arrange_at("mz")
times <- sort(unique(data_table$rt))
Expand Down Expand Up @@ -778,7 +777,7 @@ recover.weaker <- function(filename,
mz = this.rec$mz[this.sel],
rt = this.rec$rt[this.sel] + this.time.adjust,
area = this.rec$intensities[this.sel],
sample_id = grep(sample_name, colnames(metadata_table)) - 8 # offset for other columns `mz`, `rt` etc
sample_id = sample_name
)
}
}
Expand Down
9 changes: 6 additions & 3 deletions R/semi.sup.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ semi.sup <- function(
match.tol.ppm=NA,
new.feature.min.count=2,
recover.min.count=3,
intensity.weighted=FALSE)
intensity.weighted=FALSE,
sample_names = NA)
{
setwd(folder)
files<-files[order(files)]
Expand Down Expand Up @@ -243,7 +244,8 @@ semi.sup <- function(
mz_tol_relative = align.mz.tol,
rt_tol_relative = align.rt.tol,
mz_max_diff = 10 * mz.tol,
mz_tol_absolute = max.align.mz.diff
mz_tol_absolute = max.align.mz.diff,
sample_names = sample_names
)
)

Expand Down Expand Up @@ -456,7 +458,8 @@ semi.sup <- function(
mz_tol_relative = align.mz.tol,
rt_tol_relative = align.rt.tol,
mz_max_diff = 10 * mz.tol,
mz_tol_absolute = max.align.mz.diff
mz_tol_absolute = max.align.mz.diff,
sample_names = sample_names
)
)

Expand Down
5 changes: 4 additions & 1 deletion R/two.step.hybrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,8 @@ two.step.hybrid <- function(filenames,
batches_idx <- unique(metadata$batch)
batchwise <- new("list")
message("* processing ", length(batches_idx), " batches separately")

sample_names <- get_sample_name(filenames)

for (batch.i in batches_idx) {
files_batch <- dplyr::filter(filenames_batchwise, batch == batch.i)$filename
Expand Down Expand Up @@ -391,7 +393,8 @@ two.step.hybrid <- function(filenames,
use.observed.range = use.observed.range,
shape.model = shape.model,
new.feature.min.count = new.feature.min.count,
recover.min.count = recover.min.count
recover.min.count = recover.min.count,
sample_names = sample_names
)

features$final.ftrs <- features$final.ftrs[order(features$final.ftrs[, 1], features$final.ftrs[, 2]), ]
Expand Down
8 changes: 5 additions & 3 deletions R/unsupervised.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ sort_samples_by_acquisition_number <- function (filenames) {
}

align_features <- function(sample_names, ...) {
# if this will be used in Galaxy wrapper, needs to be fixed (pass also sample_names)
aligned <- feature.align(...)

list(
Expand Down Expand Up @@ -242,7 +243,8 @@ unsupervised <- function(
mz_tol_relative = align_mz_tol,
mz_tol_absolute = max_align_mz_diff,
mz_max_diff = 10 * mz_tol,
rt_tol_relative = align_rt_tol
rt_tol_relative = align_rt_tol,
sample_names = sample_names
)

message("**** computing template ****")
Expand Down Expand Up @@ -270,7 +272,7 @@ unsupervised <- function(
aligned <- create_aligned_feature_table(
dplyr::bind_rows(adjusted_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
adjusted_clusters$rt_tol_relative,
adjusted_clusters$mz_tol_relative
)
Expand Down Expand Up @@ -314,7 +316,7 @@ unsupervised <- function(
recovered_aligned <- create_aligned_feature_table(
dplyr::bind_rows(recovered_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
recovered_clusters$rt_tol_relative,
recovered_clusters$mz_tol_relative
)
Expand Down
Loading