Skip to content

Commit

Permalink
Merge pull request #153 from xtrojak/clean_up_filenames
Browse files Browse the repository at this point in the history
Use proper sample IDs inside feature tables
  • Loading branch information
hechth authored Oct 25, 2022
2 parents 6029737 + 8edd88d commit 5ac72f5
Show file tree
Hide file tree
Showing 14 changed files with 149 additions and 130 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
- use proper sample IDs inside feature tables [#153](https://github.com/RECETOX/recetox-aplcms/pull/153)
### Removed

## [0.9.4] - 2022-05-10
Expand Down
12 changes: 4 additions & 8 deletions R/adjust.time.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,7 @@ compute_template_adjusted_rt <- function(combined, sel, j) {

# now the first column is the template retention time.
# the second column is the to-be-adjusted retention time

cat(c("sample", j, "using", nrow(all_features), ", "))
if (j %% 3 == 0) {
cat("\n")
}


all_features <- all_features[order(all_features[, 2]), ]
return(all_features)
}
Expand Down Expand Up @@ -77,10 +72,11 @@ fill_missing_values <- function(orig.feature, this.feature) {

compute_template <- function(extracted_features) {
num.ftrs <- sapply(extracted_features, nrow)
template <- which.max(num.ftrs)
template_id <- which.max(num.ftrs)
template <- extracted_features[[template_id]]$sample_id[1]
message(paste("the template is sample", template))

candi <- tibble::as_tibble(extracted_features[[template]]) |> dplyr::select(c(mz, rt))
candi <- tibble::as_tibble(extracted_features[[template_id]]) |> dplyr::select(c(mz, rt))
template_features <- dplyr::bind_cols(candi, sample_id = rep(template, nrow(candi)))
return(tibble::as_tibble(template_features))
}
Expand Down
8 changes: 5 additions & 3 deletions R/compute_clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#' @param mz_max_diff float Maximum difference between featuure mz values to belong to the same cluster.
#' @param rt_tol_relative float Relative retention time tolerance to use for grouping features.
#' @param do.plot bool Plot graphics or not.
#' @param sample_names list List of sample names.
#' @return Returns a list with following items:
#' \itemize{
#' \item feature_tables - list - Feature tables with added columns [sample_id, cluster].
Expand All @@ -23,10 +24,11 @@ compute_clusters <- function(feature_tables,
mz_tol_absolute,
mz_max_diff,
rt_tol_relative,
do.plot = FALSE) {
do.plot = FALSE,
sample_names = NA) {
number_of_samples <- length(feature_tables)
all <- concatenate_feature_tables(feature_tables)

all <- concatenate_feature_tables(feature_tables, sample_names)
if (is.na(mz_tol_relative)) {
mz_tol_relative <- find.tol(
all$mz,
Expand Down
47 changes: 26 additions & 21 deletions R/feature.align.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ add_row <- function(df, data, i, column_names) {
}


create_output <- function(sample_grouped, number_of_samples) {
create_output <- function(sample_grouped, sample_names) {
number_of_samples <- length(sample_names)
intensity_row <- rep(0, number_of_samples)
rt_row <- rep(0, number_of_samples)
sample_presence <- rep(0, number_of_samples)

for (i in seq_along(intensity_row)) {
filtered <- filter(sample_grouped, sample_id == i)
filtered <- filter(sample_grouped, sample_id == sample_names[i])
if (nrow(filtered) != 0) {
sample_presence[i] <- 1
intensity_row[i] <- sum(filtered$area)
Expand Down Expand Up @@ -68,23 +69,23 @@ filter_based_on_density <- function(sample, turns, index, i) {
}


select_rt <- function(sample, rt_tol_relative, min_occurrence, number_of_samples) {
select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
for (i in seq_along(turns$peaks)) {
sample_grouped <- filter_based_on_density(sample, turns, 2, i)
if (validate_contents(sample_grouped, min_occurrence)) {
return(create_output(sample_grouped, number_of_samples))
return(create_output(sample_grouped, sample_names))
}
}
}


select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples) {
select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
for (i in seq_along(turns$peaks)) {
sample_grouped <- filter_based_on_density(sample, turns, 1, i)
if (validate_contents(sample_grouped, min_occurrence)) {
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, number_of_samples))
return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
}
}
}
Expand All @@ -96,32 +97,33 @@ create_rows <- function(features,
mz_tol_relative,
rt_tol_relative,
min_occurrence,
number_of_samples) {
sample_names) {
if (i %% 100 == 0) {
gc()
} # call Garbage Collection for performance improvement?

sample <- dplyr::filter(features, cluster == sel.labels[i])
if (nrow(sample) > 1) {
if (validate_contents(sample, min_occurrence)) {
return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, number_of_samples))
return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
}
} else if (min_occurrence == 1) {
return(create_output(sample_grouped, number_of_samples))
return(create_output(sample_grouped, sample_names))
}
return(NULL)
}


create_aligned_feature_table <- function(all_table,
min_occurrence,
number_of_samples,
sample_names,
rt_tol_relative,
mz_tol_relative) {

metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", paste0("sample_", 1:number_of_samples))
intensity_colnames <- c("id", paste0("sample_", 1:number_of_samples, "_intensity"))
rt_colnames <- c("id", paste0("sample_", 1:number_of_samples, "_rt"))
number_of_samples <- length(sample_names)
metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
intensity_colnames <- c("id", paste0(sample_names, "_intensity"))
rt_colnames <- c("id", paste0(sample_names, "_rt"))

aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)

Expand All @@ -139,7 +141,7 @@ create_aligned_feature_table <- function(all_table,
mz_tol_relative,
rt_tol_relative,
min_occurrence,
number_of_samples
sample_names
)

if (!is.null(rows)) {
Expand Down Expand Up @@ -168,6 +170,7 @@ create_aligned_feature_table <- function(all_table,
#' when the m/z range is wide. This parameter limits the tolerance in absolute terms. It mostly
#' influences feature matching in higher m/z range.
#' @param do.plot Indicates whether plot should be drawn.
#' @param sample_names list List of sample names.
#' @return Returns a list object with the following objects in it:
#' \itemize{
#' \item aligned.ftrs - A matrix, with columns of m/z values, elution times, signal strengths in each spectrum.
Expand All @@ -178,37 +181,39 @@ create_aligned_feature_table <- function(all_table,
#' @export
#' @examples
#' data(extracted)
#' feature.align(extracted, mz_max_diff = 10 * 1e-05, do.plot = FALSE)
#' feature.align(extracted, mz_max_diff = 10 * 1e-05, do.plot = FALSE, sample_names = c("s1", "s2", "s3"))
feature.align <- function(features,
min_occurrence = 2,
mz_tol_relative = NA,
rt_tol_relative = NA,
mz_max_diff = 1e-4,
mz_tol_absolute = 0.01,
do.plot = TRUE) {
do.plot = TRUE,
sample_names = NA) {
if (do.plot) {
par(mfrow = c(3, 2))
draw_plot(label = "Feature alignment", cex = 2)
draw_plot()
}



number_of_samples <- length(features)
if (number_of_samples > 1) {
res <- compute_clusters(
features,
mz_tol_relative,
mz_tol_absolute,
mz_max_diff,
rt_tol_relative
rt_tol_relative,
do.plot,
sample_names
)

all_table <- dplyr::bind_rows(res$feature_tables)

aligned_features <- create_aligned_feature_table(
all_table,
min_occurrence,
number_of_samples,
sample_names,
res$rt_tol_relative,
res$mz_tol_relative
)
Expand Down
7 changes: 4 additions & 3 deletions R/hybrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ hybrid <- function(
mz_tol_relative = align_mz_tol,
mz_tol_absolute = max_align_mz_diff,
mz_max_diff = 10 * mz_tol,
rt_tol_relative = align_rt_tol
rt_tol_relative = align_rt_tol,
sample_names = sample_names
)

message("**** computing template ****")
Expand Down Expand Up @@ -318,7 +319,7 @@ hybrid <- function(
aligned <- create_aligned_feature_table(
dplyr::bind_rows(adjusted_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
adjusted_clusters$rt_tol_relative,
adjusted_clusters$mz_tol_relative
)
Expand Down Expand Up @@ -392,7 +393,7 @@ hybrid <- function(
recovered_aligned <- create_aligned_feature_table(
dplyr::bind_rows(adjusted_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
adjusted_clusters$rt_tol_relative,
adjusted_clusters$mz_tol_relative
)
Expand Down
3 changes: 1 addition & 2 deletions R/recover.weaker.R
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,6 @@ recover.weaker <- function(filename,
bandwidth = .5,
recover.min.count = 3,
intensity.weighted = FALSE) {

# load raw data
data_table <- load_file(filename) |> dplyr::arrange_at("mz")
times <- sort(unique(data_table$rt))
Expand Down Expand Up @@ -778,7 +777,7 @@ recover.weaker <- function(filename,
mz = this.rec$mz[this.sel],
rt = this.rec$rt[this.sel] + this.time.adjust,
area = this.rec$intensities[this.sel],
sample_id = grep(sample_name, colnames(metadata_table)) - 8 # offset for other columns `mz`, `rt` etc
sample_id = sample_name
)
}
}
Expand Down
9 changes: 6 additions & 3 deletions R/semi.sup.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ semi.sup <- function(
match.tol.ppm=NA,
new.feature.min.count=2,
recover.min.count=3,
intensity.weighted=FALSE)
intensity.weighted=FALSE,
sample_names = NA)
{
setwd(folder)
files<-files[order(files)]
Expand Down Expand Up @@ -243,7 +244,8 @@ semi.sup <- function(
mz_tol_relative = align.mz.tol,
rt_tol_relative = align.rt.tol,
mz_max_diff = 10 * mz.tol,
mz_tol_absolute = max.align.mz.diff
mz_tol_absolute = max.align.mz.diff,
sample_names = sample_names
)
)

Expand Down Expand Up @@ -456,7 +458,8 @@ semi.sup <- function(
mz_tol_relative = align.mz.tol,
rt_tol_relative = align.rt.tol,
mz_max_diff = 10 * mz.tol,
mz_tol_absolute = max.align.mz.diff
mz_tol_absolute = max.align.mz.diff,
sample_names = sample_names
)
)

Expand Down
5 changes: 4 additions & 1 deletion R/two.step.hybrid.R
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,8 @@ two.step.hybrid <- function(filenames,
batches_idx <- unique(metadata$batch)
batchwise <- new("list")
message("* processing ", length(batches_idx), " batches separately")

sample_names <- get_sample_name(filenames)

for (batch.i in batches_idx) {
files_batch <- dplyr::filter(filenames_batchwise, batch == batch.i)$filename
Expand Down Expand Up @@ -391,7 +393,8 @@ two.step.hybrid <- function(filenames,
use.observed.range = use.observed.range,
shape.model = shape.model,
new.feature.min.count = new.feature.min.count,
recover.min.count = recover.min.count
recover.min.count = recover.min.count,
sample_names = sample_names
)

features$final.ftrs <- features$final.ftrs[order(features$final.ftrs[, 1], features$final.ftrs[, 2]), ]
Expand Down
8 changes: 5 additions & 3 deletions R/unsupervised.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ sort_samples_by_acquisition_number <- function (filenames) {
}

align_features <- function(sample_names, ...) {
# if this will be used in Galaxy wrapper, needs to be fixed (pass also sample_names)
aligned <- feature.align(...)

list(
Expand Down Expand Up @@ -242,7 +243,8 @@ unsupervised <- function(
mz_tol_relative = align_mz_tol,
mz_tol_absolute = max_align_mz_diff,
mz_max_diff = 10 * mz_tol,
rt_tol_relative = align_rt_tol
rt_tol_relative = align_rt_tol,
sample_names = sample_names
)

message("**** computing template ****")
Expand Down Expand Up @@ -270,7 +272,7 @@ unsupervised <- function(
aligned <- create_aligned_feature_table(
dplyr::bind_rows(adjusted_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
adjusted_clusters$rt_tol_relative,
adjusted_clusters$mz_tol_relative
)
Expand Down Expand Up @@ -314,7 +316,7 @@ unsupervised <- function(
recovered_aligned <- create_aligned_feature_table(
dplyr::bind_rows(recovered_clusters$feature_tables),
min_exp,
number_of_samples,
sample_names,
recovered_clusters$rt_tol_relative,
recovered_clusters$mz_tol_relative
)
Expand Down
Loading

0 comments on commit 5ac72f5

Please sign in to comment.