diff --git a/.github/workflows/r-conda.yml b/.github/workflows/r-conda.yml
index 0bd68ef0..1994c81a 100644
--- a/.github/workflows/r-conda.yml
+++ b/.github/workflows/r-conda.yml
@@ -54,8 +54,6 @@ jobs:
         run: wget -P tests/testdata/recovered/recovered-corrected -i tests/remote-files/recovered-corrected.txt
       - name: Fetch filtered test data
         run: wget -P tests/testdata/filtered -i tests/remote-files/filtered.txt
-      - name: Fetch run_filter test data
-        run: wget -P tests/testdata/filtered/run_filter -i tests/remote-files/run_filter.txt
       - name: Fetch features test data
         run: wget -P tests/testdata/features -i tests/remote-files/features.txt
       - name: Fetch clusters test data
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 911a9607..9f8846df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,14 +11,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - added tests for `feature.align.R` ([#40](https://github.com/RECETOX/recetox-aplcms/pull/40)), and `adjust.time.R` ([#39](https://github.com/RECETOX/recetox-aplcms/pull/40))
 - added CI to repository's GitHub Actions [#45](https://github.com/RECETOX/recetox-aplcms/pull/45),[#49](https://github.com/RECETOX/recetox-aplcms/pull/49)
 - added additional test cases for hybrid [#133](https://github.com/RECETOX/recetox-aplcms/pull/133)
-- added tests and testdata for run_filter.R [#156](https://github.com/RECETOX/recetox-aplcms/pull/156)
 ### Changed
 - refactored `feature.align.R` [#63](https://github.com/RECETOX/recetox-aplcms/pull/63)[#88](https://github.com/RECETOX/recetox-aplcms/pull/88)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102)
 - refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102)
 - refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
 - refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
 - refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
-- refactored `cont.index.R` and renamed as `run_filter.R` [#156](https://github.com/RECETOX/recetox-aplcms/pull/156)
 - use proper sample IDs inside feature tables [#153](https://github.com/RECETOX/recetox-aplcms/pull/153)
 
 ### Removed
diff --git a/NAMESPACE b/NAMESPACE
index 854995b2..97699d24 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -28,6 +28,7 @@ export(compute_mz_sd)
 export(compute_scale)
 export(compute_start_bound)
 export(compute_target_times)
+export(cont.index)
 export(draw_rt_correction_plot)
 export(draw_rt_normal_peaks)
 export(duplicate.row.remove)
@@ -67,7 +68,6 @@ export(prof.to.features)
 export(recover.weaker)
 export(rev_cum_sum)
 export(rm.ridge)
-export(run_filter)
 export(semi.sup)
 export(sort_samples_by_acquisition_number)
 export(span)
diff --git a/R/cont.index.R b/R/cont.index.R
new file mode 100644
index 00000000..8fc335f2
--- /dev/null
+++ b/R/cont.index.R
@@ -0,0 +1,127 @@
+#' Continuity index
+#'
+#' This is an internal function. It uses continuity index (or "run filter") to select putative peaks from EIC.
+#'
+#' @param newprof The matrix containing m/z, retention time, intensity, and EIC label as columns.
+#' @param min.pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
+#' by m/z to be considered a peak.
+#' @param min.run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
+#' @return A list is returned.
+#' \itemize{
+#' \item new.rec - The matrix containing m/z, retention time, intensity, and EIC label as columns after applying the run filter.
+#' \item height.rec - The vector of peak heights.
+#' \item time.range.rec - The vector of peak retention time span.
+#' \item mz.pres.rec - The vector of proportion of non-missing m/z.
+#' }
+#' @export
+#' @examples
+#' cont.index(newprof, min.pres = min.pres, min.run = min.run)
+cont.index <- function(newprof, min.pres = 0.6, min.run = 5) {
+  collapse <- function(a) {
+    a <- a[order(a[, 2]), ]
+    a.breaks <- compute_breaks_3(a[, 2])
+
+    newa <- c(0, 0, 0, 0)
+    for (i in 2:length(a.breaks))
+    {
+      sel <- (a.breaks[i - 1] + 1):a.breaks[i]
+      newa <- rbind(newa, c(median(a[a.breaks[i], 1]), a[a.breaks[i], 2], sum(a[sel, 3]), a[a.breaks[i], 4]))
+    }
+
+    return(newa[-1, ])
+  }
+
+  labels <- newprof[, 2]
+  times <- unique(labels)
+  times <- times[order(times)]
+  time.points <- length(times)
+
+  for (i in 1:length(times)) labels[which(newprof[, 2] == times[i])] <- i # now labels is the index of time points
+  newprof[, 2] <- labels
+
+  times <- times[order(times)]
+  l <- nrow(newprof)
+  timeline <- rep(0, time.points)
+  i <- 1
+  num.stack <- 1
+  min.count.run <- min.run * time.points / (max(times) - min(times))
+  aver.time.range <- (max(times) - min(times)) / time.points
+
+  grps <- newprof[, 4]
+  uniq.grp <- unique(grps)
+  curr.label <- 1
+
+  ttt <- table(grps)
+  ttt <- ttt[ttt >= max(min.count.run * min.pres, 2)]
+  uniq.grp <- as.numeric(names(ttt))
+
+  newprof <- newprof[newprof[, 4] %in% uniq.grp, ]
+  newprof <- newprof[order(newprof[, 4], newprof[, 1]), ]
+  r.newprof <- nrow(newprof)
+  breaks <- compute_breaks_3(newprof[, 4])
+
+  new.rec <- newprof * 0
+  rec.pointer <- 1
+
+  height.rec <- mz.pres.rec <- time.range.rec <- rep(0, length(breaks))
+  mz.pres.ptr <- 1
+
+  min.run <- round(min.count.run)
+
+  for (m in 2:length(breaks))
+  {
+    this.prof <- newprof[(breaks[m - 1] + 1):breaks[m], ]
+
+    this.prof <- this.prof[order(this.prof[, 2]), ]
+    this.times <- this.prof[, 2]
+    this.intensi <- this.prof[, 3]
+    this.mass <- this.prof[, 1]
+    this.grp <- this.prof[1, 4]
+
+    this.timeline <- timeline
+    this.timeline[this.times] <- 1
+
+    to.keep <- this.times * 0
+
+    dens <- ksmooth(seq(-min.run + 1, length(this.timeline) + min.run), c(rep(0, min.run), this.timeline, rep(0, min.run)), kernel = "box", bandwidth = min.run, x.points = 1:length(this.timeline))
+    dens <- dens$y
+
+    if (max(dens) >= min.pres) {
+      measured.points <- good.points <- timeline
+      measured.points[this.times] <- 1
+
+      good.sel <- which(dens >= min.pres)
+      good.points[good.sel] <- 1
+      for (j in (-min.run):min.run)
+      {
+        curr.sel <- good.sel + j
+        curr.sel <- curr.sel[curr.sel > 0 & curr.sel <= length(times)]
+        good.points[curr.sel] <- 1
+      }
+
+      measured.points <- measured.points * good.points
+      to.keep[which(this.times %in% which(measured.points == 1))] <- 1
+    }
+
+    if (sum(to.keep) > 0) {
+      this.sel <- which(to.keep == 1)
+      this.new <- cbind(this.mass[this.sel], this.times[this.sel], this.intensi[this.sel], rep(this.grp, length(this.sel)))
+      r.new <- nrow(this.new)
+      height.rec[curr.label] <- max(this.intensi[this.sel])
+      time.range.rec[curr.label] <- times[max(this.times[this.sel])] - times[min(this.times[this.sel])]
+      mz.pres.rec[curr.label] <- length(this.sel) / (max(this.times[this.sel]) - min(this.times[this.sel]) + 1)
+      curr.label <- curr.label + 1
+
+      new.rec[rec.pointer:(rec.pointer + r.new - 1), ] <- this.new
+      rec.pointer <- rec.pointer + r.new
+    }
+  }
+  new.rec <- new.rec[1:(rec.pointer - 1), ]
+  new.rec[, 2] <- times[new.rec[, 2]]
+  results <- new("list")
+  results$new.rec <- new.rec
+  results$height.rec <- height.rec[1:(curr.label - 1)]
+  results$time.range.rec <- time.range.rec[1:(curr.label - 1)]
+  results$mz.pres.rec <- mz.pres.rec[1:(curr.label - 1)]
+  return(results)
+}
diff --git a/R/extract_features.R b/R/extract_features.R
index ce5374ca..261445fe 100644
--- a/R/extract_features.R
+++ b/R/extract_features.R
@@ -64,7 +64,7 @@ extract_features <- function(
     'msExtrema',
     'find_local_maxima',
     'combine.seq.3',
-    'run_filter',
+    'cont.index',
     'interpol.area',
     'load_file',
     'load_data',
diff --git a/R/proc.cdf.R b/R/proc.cdf.R
index 6aeed534..c210e020 100644
--- a/R/proc.cdf.R
+++ b/R/proc.cdf.R
@@ -94,10 +94,10 @@ proc.cdf <- function(filename,
   run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1]
   newprof <- newprof[newprof[, 4] %in% run.sel, ]
 
-  new.prof <- run_filter(
+  new.prof <- cont.index(
     newprof,
-    min_pres = min_presence,
-    min_run = min_elution_length
+    min.pres = min_presence,
+    min.run = min_elution_length
   )
 
   if (do.plot) {
@@ -112,10 +112,10 @@ proc.cdf <- function(filename,
   }
 
   new_rec_tibble <- tibble::tibble(
-    mz = new.prof$new_rec[, 1],
-    rt = new.prof$new_rec[, 2],
-    intensity = new.prof$new_rec[, 3],
-    group_number = new.prof$new_rec[, 4]
+    mz = new.prof$new.rec[, 1],
+    rt = new.prof$new.rec[, 2],
+    intensity = new.prof$new.rec[, 3],
+    group_number = new.prof$new.rec[, 4]
   )
 
   return(new_rec_tibble)
diff --git a/R/run_filter.R b/R/run_filter.R
deleted file mode 100644
index 7c03882e..00000000
--- a/R/run_filter.R
+++ /dev/null
@@ -1,143 +0,0 @@
-#' @description
-#' Computes unique groups
-#' @param min_count_run filter parameter.
-#' @param min_pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
-#' by m/z to be considered a peak.
-#' @param profile The matrix containing m/z, retention time, intensity, and EIC label as columns.
-#' @return unique_grp.
-compute_uniq_grp <- function(profile, min_count_run, min_pres = 0.6) {
-  grps <- profile
-  ttt <- table(grps)
-  ttt <- ttt[ttt >= max(min_count_run * min_pres, 2)]
-  unique_grp <- as.numeric(names(ttt))
-  return(unique_grp)
-}
-
-#' @description
-#' Computes the smoothed retention times by using The Nadaraya-Watson kernel regression estimate function.
-#' @param min_run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
-#' @param times. Retention times vector.
-#' @return predicted rt.
-#' @examples
-#' predict_smoothed_rt(min_run = min_run, times)
-predict_smoothed_rt <- function(min_run = 5, times) {
-  # ksmooth(x, y, kernel, bandwidth, range, n.points, x.points)
-  smooth <- ksmooth(
-    seq(-min_run + 1, length(times) + min_run),
-    c(rep(0, min_run),
-      times,
-      rep(0, min_run)),
-    kernel = "box",
-    bandwidth = min_run,
-    x.points = 1:length(times)
-  )
-  # vector of smoothed estimates for the regression at the corresponding x
-  smooth <- smooth$y
-  return(smooth)
-}
-
-#' @description
-#' This function labels the indices of values kept to perform further calculations
-#' @param min_run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
-#' @param min_pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
-#' by m/z to be considered a peak.
-#' @param timeline.
-#' @param this_times.
-#' @param times. Retention times vector.
-#' @return to_keep.
-label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) {
-  this_timeline <- timeline
-  this_timeline[this_times] <- 1
-  to_keep <- this_times * 0
-
-  # filtering based on the kernel regression estimate
-  this_smooth <- predict_smoothed_rt(min_run, this_timeline)
-  if (max(this_smooth) >= min_pres) {
-    measured_points <- good_points <- timeline
-    measured_points[this_times] <- 1
-
-    good_sel <- which(this_smooth >= min_pres)
-    good_points[good_sel] <- 1
-    for (j in (-min_run):min_run) {
-      curr_sel <- good_sel + j
-      curr_sel <- curr_sel[curr_sel > 0 & curr_sel <= length(times)]
-      good_points[curr_sel] <- 1
-    }
-
-    measured_points <- measured_points * good_points
-    to_keep[which(this_times %in% which(measured_points == 1))] <- 1
-  }
-  return(to_keep)
-}
-#' @description
-#' Continuity index.
-#' Internal function that removes noise in the retention time dimension. It uses continuity index (or "run filter") to select putative peaks from EIC.
-#' @param newprof The matrix containing m/z, retention time, intensity, and EIC label as columns.
-#' @param min_pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
-#' by m/z to be considered a peak.
-#' @param min_run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
-#' @return A list is returned. new_rec - The matrix containing m/z, retention time, intensity, and EIC label as columns after applying the run filter.
-#' @export
-#' @examples
-#' run_filter(newprof, min_pres = min_pres, min_run = min_run)
-run_filter <- function(newprof,
-                       min_pres = 0.6,
-                       min_run = 5) {
-
-  newprof <- tibble::tibble(mz = newprof[,1], rt = newprof[,2], intensi = newprof[,3], grps = newprof[,4])
-
-  # ordering retention time values
-  labels <- newprof$rt
-  times <- unique(labels)
-  times <- times[order(times)]
-
-  for (i in 1:length(times)) labels[which(newprof$rt == times[i])] <- i #now labels is the index of time points
-  newprof$rt <- labels
-
-  # calculates the minimun number of rt points to be considered a peak
-  min_count_run <- min_run * length(times) / (max(times) - min(times))
-  min_run <- round(min_count_run)
-
-  # computes unique groups
-  uniq_grp <- compute_uniq_grp(newprof$grps, min_count_run)
-
-  # ordered by mz and grps data that are inside unigrps
-  newprof <- dplyr::filter(newprof, grps %in% uniq_grp) |> dplyr::arrange(grps, mz)
-
-  # computes break points i.e. indices of mass differences greater than min_mz_tol
-  breaks <- compute_breaks_3(newprof$grps)
-
-  # init counters for loop
-  new_rec <- newprof * 0
-  rec_pointer <- 1
-  timeline <- rep(0, length(times))
-  for (m in 2:length(breaks))
-  {
-    this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt")
-
-    to_keep <- label_val_to_keep(
-      min_run,
-      timeline,
-      min_pres,
-      this_prof$rt,
-      times
-    )
-
-    # operation over selected indices
-    if (sum(to_keep) > 0) {
-      this_sel <- which(to_keep == 1)
-      this_new <- dplyr::slice(this_prof, this_sel)
-      r_new <- nrow(this_new)
-      new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new
-      rec_pointer <- rec_pointer + r_new
-    }
-  }
-
-  new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1))
-  new_rec[, 2] <- times[new_rec[, 2]]
-
-  results <- new("list")
-  results$new_rec <- new_rec
-
-  return(results)
-}
diff --git a/README.md b/README.md
index 42283784..259273da 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,6 @@ wget -P tests/testdata/recovered -i tests/remote-files/recovered.txt
 wget -P tests/testdata/recovered/recovered-extracted -i tests/remote-files/recovered-extracted.txt
 wget -P tests/testdata/recovered/recovered-corrected -i tests/remote-files/recovered-corrected.txt
 wget -P tests/testdata/filtered -i tests/remote-files/filtered.txt
-wget -P tests/testdata/filtered/run_filter -i tests/remote-files/run_filter.txt
 wget -P tests/testdata/features -i tests/remote-files/features.txt
 wget -P tests/testdata/clusters -i tests/remote-files/clusters.txt
 wget -P tests/testdata/hybrid -i tests/remote-files/hybrid.txt
diff --git a/tests/remote-files/run_filter.txt b/tests/remote-files/run_filter.txt
deleted file mode 100644
index 3ea29298..00000000
--- a/tests/remote-files/run_filter.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/run_filter/mbr_test0_run_filter.parquet
-https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/input/mbr_test0.parquet
\ No newline at end of file
diff --git a/tests/testthat/test-run_filter.R b/tests/testthat/test-run_filter.R
deleted file mode 100644
index b0773af3..00000000
--- a/tests/testthat/test-run_filter.R
+++ /dev/null
@@ -1,33 +0,0 @@
-patrick::with_parameters_test_that(
-  "test run_filter",
-  {
-    if(ci_skip == TRUE) skip_on_ci()
-
-    testdata <- file.path("..", "testdata")
-    input_path <- file.path(testdata, "filtered", "run_filter", filename)
-
-    input_data <- as.matrix(arrow::read_parquet(input_path))
-    actual <- run_filter(input_data, min_pres, min_run)
-
-    actual <- tibble::tibble(
-      mz = actual$new_rec[, 1],
-      rt = actual$new_rec[, 2],
-      intensity = actual$new_rec[, 3],
-      group_number = actual$new_rec[, 4]
-    )
-
-    expected_path <- file.path(testdata, "filtered", "run_filter", paste0(.test_name, "_run_filter.parquet"))
-    expected <- arrow::read_parquet(expected_path)
-
-    expect_equal(actual, expected)
-
-  },
-  patrick::cases(
-    mbr_test0 = list(
-      filename = c("mbr_test0.parquet"),
-      min_pres = 0.5,
-      min_run = 12,
-      ci_skip = FALSE
-    )
-  )
-)
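A minimal usage sketch of the restored cont.index() follows, mirroring how proc.cdf() consumes its output in the hunk above. It assumes the package is loaded and that `newprof` is a profile matrix (m/z, retention time, intensity, EIC group) coming from the upstream binning step; the names `newprof`, `filtered`, and `new_rec_tibble` are illustrative, while cont.index(), its arguments, and the new.rec/height.rec/time.range.rec/mz.pres.rec slots are as defined in the patch.

# Illustrative sketch only: `newprof` is assumed to hold binned EIC data
# with columns m/z, retention time, intensity, and group label.
filtered <- cont.index(newprof, min.pres = 0.6, min.run = 5)

# The run-filtered profile is returned in $new.rec (dot-case, which is what
# proc.cdf() now reads); the remaining slots hold per-peak summaries.
new_rec_tibble <- tibble::tibble(
  mz = filtered$new.rec[, 1],
  rt = filtered$new.rec[, 2],
  intensity = filtered$new.rec[, 3],
  group_number = filtered$new.rec[, 4]
)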