
Refactored cont.index.R and created testcase #156

Merged: 33 commits, Nov 3, 2022

Changes from 22 commits
b1c5f92
deleted collpase func., added sort_labels_index func.
wverastegui Oct 18, 2022
6779ed9
Sorted new added function
wverastegui Oct 18, 2022
4180813
delted unused variables
wverastegui Oct 18, 2022
aabe687
removed unused variables
wverastegui Oct 18, 2022
1a2e43f
Added function, deleted unused variab., organized lines
wverastegui Oct 19, 2022
a29bcda
Added omments
wverastegui Oct 19, 2022
20d6616
added function and comments
wverastegui Oct 20, 2022
f8056ba
use tibble, create functions, add coments
wverastegui Oct 21, 2022
bc4b33f
Refact
wverastegui Oct 25, 2022
1f39688
Implemented test-cont_index.R
wverastegui Oct 25, 2022
2ef0717
refactor
wverastegui Oct 25, 2022
7db4c2d
Refact test-cont.index.R
wverastegui Oct 26, 2022
bd92147
add descriptions, refactored, commented unused objects
wverastegui Oct 26, 2022
b39deae
Deleted unused objects
wverastegui Oct 27, 2022
db82885
reanamed files, function and variables. Refact
wverastegui Oct 27, 2022
e9f5a65
renamed files & variables. add comments & refact
wverastegui Oct 27, 2022
bce56b5
Updates and refact
wverastegui Oct 27, 2022
7c3739a
Update r-conda.yml for run_filter testcase
wverastegui Oct 27, 2022
79adadd
Updated NAMESPACE
wverastegui Oct 27, 2022
9c4568b
Deleted old files
wverastegui Oct 31, 2022
f264ec4
Updated changelog.md with PR number
wverastegui Oct 31, 2022
83e297a
Merge branch 'master' into new111
hechth Nov 1, 2022
6cce371
added link to input testdata
wverastegui Nov 1, 2022
fe625eb
Merge branch 'new111' of https://github.com/wverastegui/recetox-aplcm…
wverastegui Nov 1, 2022
41fba06
Update r-conda.yml
hechth Nov 1, 2022
6158b20
Update CHANGELOG.md
hechth Nov 1, 2022
00e1b53
Update CHANGELOG.md
hechth Nov 1, 2022
2fe48c5
Update CHANGELOG.md
hechth Nov 1, 2022
e56e755
Update CHANGELOG.md
hechth Nov 1, 2022
b7d878c
Update tests/testthat/test-run_filter.R
hechth Nov 1, 2022
d902b9d
Update R/run_filter.R
hechth Nov 1, 2022
7a81c10
Updated location of input file in test-run_filter
wverastegui Nov 2, 2022
9839836
Merge branch 'new111' of https://github.com/wverastegui/recetox-aplcm…
wverastegui Nov 2, 2022
2 changes: 2 additions & 0 deletions .github/workflows/r-conda.yml
@@ -54,6 +54,8 @@ jobs:
run: wget -P tests/testdata/recovered/recovered-corrected -i tests/remote-files/recovered-corrected.txt
- name: Fetch filtered test data
run: wget -P tests/testdata/filtered -i tests/remote-files/filtered.txt
- name: Fetch run_filter test data
run: wget -P tests/testdata/filtered/run_filter -i tests/remote-files/run_filter.txt
- name: Fetch features test data
run: wget -P tests/testdata/features -i tests/remote-files/features.txt
- name: Fetch clusters test data
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -11,12 +11,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- added tests for `feature.align.R` ([#40](https://github.com/RECETOX/recetox-aplcms/pull/40)), and `adjust.time.R` ([#39](https://github.com/RECETOX/recetox-aplcms/pull/40))
- added CI to repository's GitHub Actions [#45](https://github.com/RECETOX/recetox-aplcms/pull/45),[#49](https://github.com/RECETOX/recetox-aplcms/pull/49)
- added additional test cases for hybrid [#133](https://github.com/RECETOX/recetox-aplcms/pull/133)
- added tests and testdata for `run_filter.R` [#155](https://github.com/RECETOX/recetox-aplcms/pull/155)
### Changed
- refactored `feature.align.R` [#63](https://github.com/RECETOX/recetox-aplcms/pull/63)[#88](https://github.com/RECETOX/recetox-aplcms/pull/88)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102)
- refactored `adjust.time.R` [#64](https://github.com/RECETOX/recetox-aplcms/pull/64)[#102](https://github.com/RECETOX/recetox-aplcms/pull/102)
- refactored `find.tol.time.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `find.turn.point.R` [#91](https://github.com/RECETOX/recetox-aplcms/pull/91)
- refactored `proc.cdf.R` and `adaptive.bin.R` [#137](https://github.com/RECETOX/recetox-aplcms/pull/137)
- refactored `cont.index.R` and renamed as `run_filter.R` [#155](https://github.com/RECETOX/recetox-aplcms/pull/155)
- updated function names in `proc.cdf.R` and `extract_features.R` [#155](https://github.com/RECETOX/recetox-aplcms/pull/155)
- use proper sample IDs inside feature tables [#153](https://github.com/RECETOX/recetox-aplcms/pull/153)
### Removed

2 changes: 1 addition & 1 deletion NAMESPACE
@@ -28,7 +28,6 @@ export(compute_mz_sd)
export(compute_scale)
export(compute_start_bound)
export(compute_target_times)
-export(cont.index)
export(draw_rt_correction_plot)
export(draw_rt_normal_peaks)
export(duplicate.row.remove)
@@ -68,6 +67,7 @@ export(prof.to.features)
export(recover.weaker)
export(rev_cum_sum)
export(rm.ridge)
+export(run_filter)
export(semi.sup)
export(sort_samples_by_acquisition_number)
export(span)
127 changes: 0 additions & 127 deletions R/cont.index.R

This file was deleted.

2 changes: 1 addition & 1 deletion R/extract_features.R
@@ -64,7 +64,7 @@ extract_features <- function(
'msExtrema',
'find_local_maxima',
'combine.seq.3',
-    'cont.index',
+    'run_filter',
'interpol.area',
'load_file',
'load_data',
14 changes: 7 additions & 7 deletions R/proc.cdf.R
@@ -94,10 +94,10 @@ proc.cdf <- function(filename,
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_presence & raw.prof$height.rec[, 3] > baseline_correct), 1]

newprof <- newprof[newprof[, 4] %in% run.sel, ]
-new.prof <- cont.index(
+new.prof <- run_filter(
   newprof,
-  min.pres = min_presence,
-  min.run = min_elution_length
+  min_pres = min_presence,
+  min_run = min_elution_length
)

if (do.plot) {
@@ -112,10 +112,10 @@
}

new_rec_tibble <- tibble::tibble(
-  mz = new.prof$new.rec[, 1],
-  rt = new.prof$new.rec[, 2],
-  intensity = new.prof$new.rec[, 3],
-  group_number = new.prof$new.rec[, 4]
+  mz = new.prof$new_rec[, 1],
+  rt = new.prof$new_rec[, 2],
+  intensity = new.prof$new_rec[, 3],
+  group_number = new.prof$new_rec[, 4]
)

return(new_rec_tibble)
144 changes: 144 additions & 0 deletions R/run_filter.R
@@ -0,0 +1,144 @@
#' @description
#' Computes the unique EIC groups which pass the run filter.
#' @param profile Vector of EIC (group) labels, one entry per signal.
#' @param min_count_run The minimum number of time points a group must cover to be retained.
#' @param min_pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
#' by m/z to be considered a peak.
#' @return Numeric vector of the group labels that pass the filter.
compute_uniq_grp <- function(profile, min_count_run, min_pres = 0.6) {
grps <- profile
ttt <- table(grps)
ttt <- ttt[ttt >= max(min_count_run * min_pres, 2)]
unique_grp <- as.numeric(names(ttt))
return(unique_grp)
}
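For intuition, the counting logic above can be reproduced with base R alone. The labels and thresholds below are invented illustration data, not values from this PR:

```r
# Toy illustration of the group-count filter in compute_uniq_grp:
# keep only EIC labels occurring at least max(min_count_run * min_pres, 2) times.
grps <- c(1, 1, 1, 1, 2, 2, 3)  # hypothetical EIC labels
min_count_run <- 5
min_pres <- 0.6
counts <- table(grps)
counts <- counts[counts >= max(min_count_run * min_pres, 2)]  # threshold = 3
unique_grp <- as.numeric(names(counts))
print(unique_grp)  # only group 1 appears often enough
```

With these toy values the threshold is max(5 * 0.6, 2) = 3, so groups 2 and 3 are dropped.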

#' @description
#' Computes a smoothed presence profile using the Nadaraya-Watson kernel regression estimate (ksmooth) with a box kernel.
#' @param min_run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
#' @param times Vector of values to smooth (a 0/1 presence timeline); it is padded with min_run zeros on each side before smoothing.
#' @return Vector of smoothed estimates, one per element of times.
#' @examples
#' predict_smoothed_rt(min_run = 5, times = c(0, 1, 1, 1, 0))
predict_smoothed_rt <- function(min_run = 5, times) {
# ksmooth(x, y, kernel, bandwidth, range, n.points, x.points)
smooth <- ksmooth(
seq(-min_run + 1, length(times) + min_run),
c(rep(0, min_run),
times,
rep(0, min_run)),
kernel = "box",
bandwidth = min_run,
x.points = 1:length(times)
)
# vector of smoothed estimates for the regression at the corresponding x
smooth <- smooth$y
return(smooth)
}
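To see what the box-kernel smoothing does to a presence timeline, here is a self-contained sketch mirroring the call above; the presence vector is invented for illustration:

```r
# Smooth a 0/1 presence timeline the same way predict_smoothed_rt does:
# pad with min_run zeros on each side, then apply a box kernel.
min_run <- 3
presence <- c(0, 1, 1, 1, 0, 0, 0, 1, 0)  # hypothetical timeline
smooth <- ksmooth(
  seq(-min_run + 1, length(presence) + min_run),
  c(rep(0, min_run), presence, rep(0, min_run)),
  kernel = "box",
  bandwidth = min_run,
  x.points = 1:length(presence)
)$y
# each value is the local fraction of present scans within the window
print(round(smooth, 2))
```

Isolated runs of presence are averaged down toward zero, while sustained runs stay near one, which is what the min_pres threshold later tests against.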

#' @description
#' Labels the indices of values to keep for further calculations.
#' @param min_run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
#' @param timeline Zero vector with one entry per unique retention time, used as a template.
#' @param min_pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
#' by m/z to be considered a peak.
#' @param this_times Retention-time indices of the current EIC group.
#' @param times Vector of unique retention times.
#' @return Binary vector of the same length as this_times where 1 marks the values to keep.
label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) {
this_timeline <- timeline
this_timeline[this_times] <- 1
to_keep <- this_times * 0

# filtering based on the kernel regression estimate
this_smooth <- predict_smoothed_rt(min_run, this_timeline)
if (max(this_smooth) >= min_pres) {
measured_points <- good_points <- timeline
measured_points[this_times] <- 1

good_sel <- which(this_smooth >= min_pres)
good_points[good_sel] <- 1
for (j in (-min_run):min_run)
{
curr_sel <- good_sel + j
curr_sel <- curr_sel[curr_sel > 0 & curr_sel <= length(times)]
good_points[curr_sel] <- 1
}

measured_points <- measured_points * good_points
to_keep[which(this_times %in% which(measured_points == 1))] <- 1
}
return(to_keep)
}
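The neighborhood expansion inside the loop above can be isolated as follows; the indices here are invented for illustration:

```r
# Sketch of the neighborhood expansion in label_val_to_keep: every index
# within min_run of a "good" index is also marked good.
min_run <- 2
n <- 10
good_points <- rep(0, n)
good_sel <- c(5, 6)  # hypothetical indices passing the smoothed-presence test
good_points[good_sel] <- 1
for (j in (-min_run):min_run) {
  curr_sel <- good_sel + j
  curr_sel <- curr_sel[curr_sel > 0 & curr_sel <= n]  # clamp to valid range
  good_points[curr_sel] <- 1
}
print(which(good_points == 1))  # indices 3 through 8
```

This widening is why measured points adjacent to a qualifying stretch survive the filter even if their own smoothed presence falls just below min_pres.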

#' @description
#' Continuity index ("run filter").
#' Internal function that removes noise in the retention-time dimension. It uses the continuity index (or "run filter") to select putative peaks from EICs.
#' @param newprof The matrix containing m/z, retention time, intensity, and EIC label as columns.
#' @param min_pres Run filter parameter. The minimum proportion of presence in the time period for a series of signals grouped
#' by m/z to be considered a peak.
#' @param min_run Run filter parameter. The minimum length of elution time for a series of signals grouped by m/z to be considered a peak.
#' @return A list with one element, new_rec: the matrix containing m/z, retention time, intensity, and EIC label as columns after applying the run filter.
#' @export
#' @examples
#' run_filter(newprof, min_pres = 0.6, min_run = 5)
run_filter <- function(newprof,
min_pres = 0.6,
min_run = 5) {

newprof <- tibble::tibble(mz = newprof[,1], rt = newprof[,2], intensi = newprof[,3], grps = newprof[,4])

# ordering retention time values
labels <- newprof$rt
times <- unique(labels)
times <- times[order(times)]

for (i in seq_along(times)) labels[which(newprof$rt == times[i])] <- i # labels now holds time-point indices
newprof$rt <- labels

# calculates the minimum number of rt points to be considered a peak
min_count_run <- min_run * length(times) / (max(times) - min(times))
min_run <- round(min_count_run)

# computes unique groups
uniq_grp <- compute_uniq_grp(newprof$grps, min_count_run)

# ordered by mz and grps data that are inside unigrps
newprof <- dplyr::filter(newprof, grps %in% uniq_grp) |> dplyr::arrange(grps, mz)

# computes break points, i.e. indices where the EIC group label changes
breaks <- compute_breaks_3(newprof$grps)

# init counters for loop
new_rec <- newprof * 0
rec_pointer <- 1
timeline <- rep(0, length(times))
for (m in 2:length(breaks))
{
this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt")

to_keep <- label_val_to_keep(
min_run,
timeline,
min_pres,
this_prof$rt,
times
)

# operation over selected indices
if (sum(to_keep) > 0) {
this_sel <- which(to_keep == 1)
this_new <- dplyr::slice(this_prof, this_sel)
r_new <- nrow(this_new)
new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new
rec_pointer <- rec_pointer + r_new
}
}

new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1))
new_rec[, 2] <- times[new_rec[, 2]]

results <- list()
results$new_rec <- new_rec

return(results)
}
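The min_count_run conversion near the top of run_filter maps a minimum elution length in time units to a count of scan points. A worked example, with invented scan spacing:

```r
# With 100 unique scan times spanning 49.5 time units, a minimum elution
# length of 5 units corresponds to roughly 10 consecutive scan points.
times <- seq(0, 49.5, by = 0.5)  # 100 hypothetical unique retention times
min_run <- 5
min_count_run <- min_run * length(times) / (max(times) - min(times))
print(round(min_count_run))  # -> 10
```

This count is what compute_uniq_grp then multiplies by min_pres to decide whether an EIC group has enough points to be considered at all.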
1 change: 1 addition & 0 deletions README.md
@@ -21,6 +21,7 @@ wget -P tests/testdata/recovered -i tests/remote-files/recovered.txt
wget -P tests/testdata/recovered/recovered-extracted -i tests/remote-files/recovered-extracted.txt
wget -P tests/testdata/recovered/recovered-corrected -i tests/remote-files/recovered-corrected.txt
wget -P tests/testdata/filtered -i tests/remote-files/filtered.txt
wget -P tests/testdata/filtered/run_filter -i tests/remote-files/run_filter.txt
wget -P tests/testdata/features -i tests/remote-files/features.txt
wget -P tests/testdata/clusters -i tests/remote-files/clusters.txt
wget -P tests/testdata/hybrid -i tests/remote-files/hybrid.txt
1 change: 1 addition & 0 deletions tests/remote-files/run_filter.txt
@@ -0,0 +1 @@
https://gitlab.ics.muni.cz/umsa/umsa-files/-/raw/master/testdata/recetox-aplcms/filtered/run_filter/mbr_test0_run_filter.parquet
33 changes: 33 additions & 0 deletions tests/testthat/test-run_filter.R
@@ -0,0 +1,33 @@
patrick::with_parameters_test_that(
"test run_filter.R",
{
if (ci_skip) skip_on_ci()

testdata <- file.path("..", "testdata")
input_path <- file.path(testdata, "input", filename)
@xtrojak commented on Nov 2, 2022:

There are only mzml files in input directory. Maybe you meant filtered? This is related to CI error:

── Error ('test-run_filter.R:9'): test run_filter mbr_test0 ────────────────────
Error: IOError: Failed to open local file '../testdata/input/mbr_test0.parquet'. Detail: [errno 2] No such file or directory
 Backtrace:
    ▆
 1. ├─rlang::eval_tidy(code, args)
 2. ├─base::as.matrix(arrow::read_parquet(input_path)) at test-run_filter.R:9:4
 3. └─arrow::read_parquet(input_path)
 4.   └─arrow:::make_readable_file(file)
 5.     └─arrow::mmap_open(file)
 6.       └─arrow:::io___MemoryMappedFile__Open(path, mode)

@wverastegui (author) replied on Nov 2, 2022:

The test file has been updated to the folder containing the input file (~/filtered/run_filter/). [7a81c10]
input_data <- as.matrix(arrow::read_parquet(input_path))
actual <- run_filter(input_data, min_pres, min_run)

actual <- tibble::tibble(
mz = actual$new_rec[, 1],
rt = actual$new_rec[, 2],
intensity = actual$new_rec[, 3],
group_number = actual$new_rec[, 4]
)

expected_path <- file.path(testdata, "filtered", "run_filter", paste0(.test_name, "_run_filter.parquet"))
expected <- arrow::read_parquet(expected_path)

expect_equal(actual, expected)

},
patrick::cases(
mbr_test0 = list(
filename = c("mbr_test0.parquet"),
min_pres = 0.5,
min_run = 12,
ci_skip = FALSE
)
)
)