Merge pull request #34 from maximskorik/two-step-hybrid-refactoring

Two-step hybrid refactoring
RECETOX · May 10, 2022 · 1301763 · 1301763
2 parents 267989c + 78065ba
commit 1301763
Show file tree

Hide file tree

Showing 18 changed files with 27,559 additions and 238 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,11 +9,11 @@ Description: This is a customized fork of the original work from Tianwei Yu.
         It takes the adaptive processing of LC/MS metabolomics data further
         with focus on high resolution MS for both LC and GC applications.
 Depends: R (>= 2.10), MASS, rgl, mzR, splines, doParallel, foreach,
-        iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp
+        iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble
 biocViews: Technology, MassSpectrometry
 License: GPL(>=2) + file LICENSE
 LazyLoad: yes
 NeedsCompilation: no
 Suggests: 
-    arrow, testthat (>= 3.0.0)
+    arrow, dataCompareR, testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -39,3 +39,5 @@ export(target.search)
 export(two.step.hybrid)
 export(unsupervised)
 export(hybrid)
+export(extract_pattern_colnames)
+export(sort_samples_by_acquisition_number)
diff --git a/R/adjust.time.R b/R/adjust.time.R
@@ -1,5 +1,5 @@
 adjust.time <-
-function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE)  # features is a list project, each sub-object is a matrix as identified by prof.to.features
+function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE, rt_colname="pos")  # features is a list project, each sub-object is a matrix as identified by prof.to.features
 {
 
     num.exp<-nrow(summary(features))
@@ -11,22 +11,11 @@ function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align
             text(x=0,y=0,"Retention time \n adjustment",cex=2)
         }
 
-        a<-summary(features)
-        sizes<-as.numeric(a[,1])/ncol(features[[1]])
-        sizes<-cumsum(sizes)
-        #		sel<-max(which(sizes<=5e6))
-        sel<-length(sizes)
-
-        mz<-chr<-lab<-rep(0, sizes[sel])
-        sizes<-c(0, sizes)
-
-        for(i in 1:sel)
-        {
-            mz[(sizes[i]+1):sizes[i+1]]<-features[[i]][,1]
-            chr[(sizes[i]+1):sizes[i+1]]<-features[[i]][,2]
-            lab[(sizes[i]+1):sizes[i+1]]<-i
-        }
-
+        values <- get_feature_values(features, rt_colname)
+        mz <- values$mz
+        chr <- values$chr
+        lab <- values$lab
+
         if(is.na(mz.tol))
         {
             mz.tol<-find.tol(mz,uppermost=find.tol.max.d, do.plot=do.plot)
@@ -71,7 +60,7 @@ function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align
             this.feature<-features[[j]]
             if(j != template)
             {
-                this.comb<-rbind(cbind(candi, rep(template,nrow(candi))),cbind(this.feature[,1:2],rep(j,nrow(this.feature))))
+                this.comb<-rbind(cbind(candi, label = rep(template,nrow(candi))),cbind(this.feature[,1:2],label = rep(j,nrow(this.feature))))
                 this.comb<-this.comb[order(this.comb[,1]),]
                 l<-nrow(this.comb)
 

diff --git a/R/feature.align.R b/R/feature.align.R
@@ -1,5 +1,5 @@
 feature.align <-
-function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE)     # returns a list of aligned features and original peak times
+function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE, rt_colname="pos")     # returns a list of aligned features and original peak times
 {
     if (do.plot) {
         par(mfrow=c(3,2))
@@ -28,20 +28,11 @@ function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align
     num.exp<-nrow(summary(features))
     if(num.exp>1)
     {
-        a<-summary(features)
-        sizes<-as.numeric(a[,1])/ncol(features[[1]])
-        sizes<-cumsum(sizes)
-        sel<-length(sizes)
-
-        masses<-chr<-lab<-rep(0, sizes[sel])
-        sizes<-c(0, sizes)
-
-        for(i in 1:sel)
-        {
-            masses[(sizes[i]+1):sizes[i+1]]<-features[[i]][,1]
-            chr[(sizes[i]+1):sizes[i+1]]<-features[[i]][,2]
-            lab[(sizes[i]+1):sizes[i+1]]<-i
-        }
+        values <- get_feature_values(features, rt_colname)
+        masses <- values$mz
+        chr <- values$chr
+        lab <- values$lab
+
         o<-order(masses, chr)
         masses<-masses[o]
         chr<-chr[o]
@@ -81,7 +72,7 @@ function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align
 
         area<-grps<-masses
 
-
+        sizes <- c(0, cumsum(sapply(features, nrow)))
         for(i in 1:num.exp)
         {
             this<-features[[i]]

diff --git a/R/peak.characterize.R b/R/peak.characterize.R
@@ -9,8 +9,8 @@ function(existing.row=NA, ftrs.row, chr.row)
             if(n==1) x<-c(x,mean0)
             mean1<-mean(x)
             sd1<-sd(x)
-            min1<-min(x)
-            max1<-max(x)
+            min1<-min(x, Inf)
+            max1<-max(x, -Inf)
         }else{
             m<-length(x)
             mean1<-sum(mean0*n, x)/(n+m)

diff --git a/R/semi.sup.R b/R/semi.sup.R
@@ -1,27 +1,48 @@
-semi.sup <-
-function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.pres=0.5, min.run=12, mz.tol=1e-5, baseline.correct.noise.percentile=0.05, shape.model="bi-Gaussian", BIC.factor=2, baseline.correct=0, peak.estim.method="moment", min.bw=NA, max.bw=NA, sd.cut=c(0.01,500), sigma.ratio.lim=c(0.01, 100), component.eliminate=0.01, moment.power=1, subs=NULL, align.mz.tol=NA, align.chr.tol=NA, max.align.mz.diff=0.01, pre.process=FALSE, recover.mz.range=NA, recover.chr.range=NA, use.observed.range=TRUE, match.tol.ppm=NA, new.feature.min.count=2, recover.min.count=3, intensity.weighted=FALSE)
+semi.sup <- function(
+    files,
+    folder,
+    known.table=NA,
+    n.nodes=4,
+    min.exp=2,
+    min.pres=0.5,
+    min.run=12,
+    mz.tol=1e-5,
+    baseline.correct.noise.percentile=0.05,
+    shape.model="bi-Gaussian",
+    BIC.factor=2,
+    baseline.correct=0,
+    peak.estim.method="moment",
+    min.bw=NA,
+    max.bw=NA,
+    sd.cut=c(0.01,500),
+    sigma.ratio.lim=c(0.01, 100),
+    component.eliminate=0.01,
+    moment.power=1,
+    align.mz.tol=NA,
+    align.chr.tol=NA,
+    max.align.mz.diff=0.01,
+    pre.process=FALSE,
+    recover.mz.range=NA,
+    recover.chr.range=NA,
+    use.observed.range=TRUE,
+    match.tol.ppm=NA,
+    new.feature.min.count=2,
+    recover.min.count=3,
+    intensity.weighted=FALSE)
 {
     setwd(folder)
-
-    files<-dir(pattern=file.pattern, ignore.case = TRUE)
     files<-files[order(files)]
-    if(!is.null(subs))
-    {
-        if(!is.na(subs[1])) files<-files[subs]
-    }
 
     ###############################################################################################
 
     try(dir.create("error_files"), silent = TRUE)
-    message("***************************** prifiles --> feature lists *****************************")
+    message("** extracting features")
     suf.prof<-paste(min.pres,min.run,mz.tol,baseline.correct,sep="_")
     suf<-paste(suf.prof, shape.model, sd.cut[1], sd.cut[2],component.eliminate, moment.power, sep="_")
     if(shape.model=="bi-Gaussian") suf<-paste(suf, sigma.ratio.lim[1], sigma.ratio.lim[2],sep="_")
 
     to.do<-paste(matrix(unlist(strsplit(tolower(files),"\\.")),nrow=2)[1,],suf, min.bw, max.bw,".feature",sep="_")
-    to.do<-which(!(to.do %in% dir()))
-    message(c("number of files to process: ", length(to.do)))
-
+    to.do<-which(!(to.do %in% dir()))    
 
     if(length(to.do)>0)
     {
@@ -31,7 +52,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
         cl <- makeCluster(n.nodes)
         registerDoParallel(cl)
         #clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
-        clusterEvalQ(cl, library(apLCMS))
+        clusterEvalQ(cl, library(recetox.aplcms))
 
 
         features<-foreach(i=2:length(grps)) %dopar%
@@ -81,14 +102,13 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
     for(i in 1:length(files))
     {
         this.name<-paste(strsplit(tolower(files[i]),"\\.")[[1]][1],suf, min.bw, max.bw,".feature",sep="_")
-        cat(this.name, " ")
         load(this.name)
         features[[i]]<-this.feature
     }
 
     ###############################################################################################
-    message("****************************** time correction ***************************************")
-    suf<-paste(suf,align.mz.tol,align.chr.tol,subs[1],subs[length(subs)],sep="_")
+    message("** correcting time...")
+    suf<-paste(suf,align.mz.tol,align.chr.tol,files[1],files[length(files)],sep="_")
     this.name<-paste("time_correct_done_",suf,".bin",sep="")
 
     all.files<-dir()
@@ -100,9 +120,9 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
         cl <- makeCluster(n.nodes)
         registerDoParallel(cl)
         #clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
-        clusterEvalQ(cl, library(apLCMS))
+        clusterEvalQ(cl, library(recetox.aplcms))
 
-        message(c("***** correcting time, CPU time (seconds) ",as.vector(system.time(f2<-adjust.time(features,mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
+        message(c("** correction time, CPU time (seconds) ",as.vector(system.time(f2<-adjust.time(features,mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
 
         stopCluster(cl)
         save(f2,file=this.name)
@@ -113,7 +133,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
     gc()
 
     ###############################################################################################
-    message("****************************  aligning features **************************************")
+    message("** aligning features...")
     suf<-paste(suf,min.exp,sep="_")
     this.name<-paste("aligned_done_",suf,".bin",sep="")
     all.files<-dir()
@@ -125,9 +145,9 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
         cl <- makeCluster(n.nodes)
         registerDoParallel(cl)
         #clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
-        clusterEvalQ(cl, library(apLCMS))
+        clusterEvalQ(cl, library(recetox.aplcms))
 
-        message(c("***** aligning features, CPU time (seconds): ", as.vector(system.time(aligned<-feature.align(f2, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
+        message(c("** aligned features, CPU time (seconds): ", as.vector(system.time(aligned<-feature.align(f2, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
         save(aligned,file=this.name)
         stopCluster(cl)
     }else{
@@ -137,7 +157,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
 
 
     ###############################################################################################
-    message("************************* merging to known peak table *********************************")
+    message("** merging to known peak table")
     if(is.na(match.tol.ppm)) match.tol.ppm<-aligned$mz.tol*1e6
 
     this.name<-paste("merge_done_", suf, ".bin", sep="")
@@ -242,13 +262,11 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
     gc()
 
     ###############################################################################################
-    message("**************************** recovering weaker signals *******************************")
+    message("** recovering weaker signals")
     suf<-paste(suf,recover.mz.range, recover.chr.range, use.observed.range,match.tol.ppm,new.feature.min.count,recover.min.count,sep="_")
 
     worklist<-paste(matrix(unlist(strsplit(tolower(files),"\\.")),nrow=2)[1,],suf,"semi_sup.recover",sep="_")
-    to.do<-which(!(worklist %in% dir()))
-    message(c("number of files to process: ", length(to.do)))
-
+    to.do<-which(!(worklist %in% dir()))    
     if(length(to.do)>0)
     {
         grps<-round(seq(0, length(to.do), length=n.nodes+1))
@@ -257,7 +275,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
         cl <- makeCluster(n.nodes)
         registerDoParallel(cl)
         #clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
-        clusterEvalQ(cl, library(apLCMS))
+        clusterEvalQ(cl, library(recetox.aplcms))
 
         features.recov<-foreach(i=2:length(grps)) %dopar%
         {
@@ -275,7 +293,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
 
 
     ##############################################################################################
-    message("loading feature tables after recovery")
+    message("** loading feature tables after recovery...")
     features.recov<-new("list")
 
     for(i in 1:length(files))
@@ -286,7 +304,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
     }
 
     ##############################################################################################
-    message("****************************** second round time correction *************************")
+    message("** second round time correction...")
     suf<-paste(suf,"round 2",sep="_")
     this.name<-paste("time_correct_done_",suf,".bin",sep="")
 
@@ -298,17 +316,17 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
         cl <- makeCluster(n.nodes)
         registerDoParallel(cl)
         #clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
-        clusterEvalQ(cl, library(apLCMS))
+        clusterEvalQ(cl, library(recetox.aplcms))
 
-        message(c("***** correcting time, CPU time (seconds) ",as.vector(system.time(f2.recov<-adjust.time(features.recov, mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
+        message(c("** correction time, CPU time (seconds) ",as.vector(system.time(f2.recov<-adjust.time(features.recov, mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
         save(f2.recov,file=this.name)
         stopCluster(cl)
     }else{
         load(this.name)
     }
     gc()
     ###############################################################################################
-    message("**************************** second round aligning features *************************")
+    message("** second round aligning features...")
     suf<-paste(suf,"min_exp", min.exp, 1,sep="_")
     this.name<-paste("aligned_done_",suf,".bin",sep="")
     all.files<-dir()
@@ -318,9 +336,9 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
         cl <- makeCluster(n.nodes)
         registerDoParallel(cl)
         #clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
-        clusterEvalQ(cl, library(apLCMS))
+        clusterEvalQ(cl, library(recetox.aplcms))
 
-        message(c("***** aligning features, CPU time (seconds): ", as.vector(system.time(aligned.recov<-feature.align(f2.recov, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
+        message(c("** aligned features, CPU time (seconds): ", as.vector(system.time(aligned.recov<-feature.align(f2.recov, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
         save(aligned.recov,file=this.name)
         stopCluster(cl)
     }else{