Skip to content

Commit

Permalink
Merge pull request #34 from maximskorik/two-step-hybrid-refactoring
Browse files Browse the repository at this point in the history
Two-step hybrid refactoring
  • Loading branch information
maximskorik authored May 10, 2022
2 parents 267989c + 78065ba commit 1301763
Show file tree
Hide file tree
Showing 18 changed files with 27,559 additions and 238 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ Description: This is a customized fork of the original work from Tianwei Yu.
It takes the adaptive processing of LC/MS metabolomics data further
with focus on high resolution MS for both LC and GC applications.
Depends: R (>= 2.10), MASS, rgl, mzR, splines, doParallel, foreach,
iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp
iterators, snow, gbm, e1071, randomForest, ROCR, ROCS, Rcpp, dplyr, tidyr, stringr, tibble
biocViews: Technology, MassSpectrometry
License: GPL(>=2) + file LICENSE
LazyLoad: yes
NeedsCompilation: no
Suggests:
arrow, testthat (>= 3.0.0)
arrow, dataCompareR, testthat (>= 3.0.0)
Config/testthat/edition: 3
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,5 @@ export(target.search)
export(two.step.hybrid)
export(unsupervised)
export(hybrid)
export(extract_pattern_colnames)
export(sort_samples_by_acquisition_number)
25 changes: 7 additions & 18 deletions R/adjust.time.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
adjust.time <-
function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE) # features is a list project, each sub-object is a matrix as identified by prof.to.features
function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE, rt_colname="pos") # features is a list project, each sub-object is a matrix as identified by prof.to.features
{

num.exp<-nrow(summary(features))
Expand All @@ -11,22 +11,11 @@ function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align
text(x=0,y=0,"Retention time \n adjustment",cex=2)
}

a<-summary(features)
sizes<-as.numeric(a[,1])/ncol(features[[1]])
sizes<-cumsum(sizes)
# sel<-max(which(sizes<=5e6))
sel<-length(sizes)

mz<-chr<-lab<-rep(0, sizes[sel])
sizes<-c(0, sizes)

for(i in 1:sel)
{
mz[(sizes[i]+1):sizes[i+1]]<-features[[i]][,1]
chr[(sizes[i]+1):sizes[i+1]]<-features[[i]][,2]
lab[(sizes[i]+1):sizes[i+1]]<-i
}

values <- get_feature_values(features, rt_colname)
mz <- values$mz
chr <- values$chr
lab <- values$lab

if(is.na(mz.tol))
{
mz.tol<-find.tol(mz,uppermost=find.tol.max.d, do.plot=do.plot)
Expand Down Expand Up @@ -71,7 +60,7 @@ function(features,mz.tol=NA, chr.tol=NA,colors=NA,find.tol.max.d=1e-4, max.align
this.feature<-features[[j]]
if(j != template)
{
this.comb<-rbind(cbind(candi, rep(template,nrow(candi))),cbind(this.feature[,1:2],rep(j,nrow(this.feature))))
this.comb<-rbind(cbind(candi, label = rep(template,nrow(candi))),cbind(this.feature[,1:2],label = rep(j,nrow(this.feature))))
this.comb<-this.comb[order(this.comb[,1]),]
l<-nrow(this.comb)

Expand Down
23 changes: 7 additions & 16 deletions R/feature.align.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
feature.align <-
function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE) # returns a list of aligned features and original peak times
function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align.mz.diff=0.01, do.plot=TRUE, rt_colname="pos") # returns a list of aligned features and original peak times
{
if (do.plot) {
par(mfrow=c(3,2))
Expand Down Expand Up @@ -28,20 +28,11 @@ function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align
num.exp<-nrow(summary(features))
if(num.exp>1)
{
a<-summary(features)
sizes<-as.numeric(a[,1])/ncol(features[[1]])
sizes<-cumsum(sizes)
sel<-length(sizes)

masses<-chr<-lab<-rep(0, sizes[sel])
sizes<-c(0, sizes)

for(i in 1:sel)
{
masses[(sizes[i]+1):sizes[i+1]]<-features[[i]][,1]
chr[(sizes[i]+1):sizes[i+1]]<-features[[i]][,2]
lab[(sizes[i]+1):sizes[i+1]]<-i
}
values <- get_feature_values(features, rt_colname)
masses <- values$mz
chr <- values$chr
lab <- values$lab

o<-order(masses, chr)
masses<-masses[o]
chr<-chr[o]
Expand Down Expand Up @@ -81,7 +72,7 @@ function(features, min.exp=2,mz.tol=NA,chr.tol=NA,find.tol.max.d=1e-4, max.align

area<-grps<-masses


sizes <- c(0, cumsum(sapply(features, nrow)))
for(i in 1:num.exp)
{
this<-features[[i]]
Expand Down
4 changes: 2 additions & 2 deletions R/peak.characterize.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ function(existing.row=NA, ftrs.row, chr.row)
if(n==1) x<-c(x,mean0)
mean1<-mean(x)
sd1<-sd(x)
min1<-min(x)
max1<-max(x)
min1<-min(x, Inf)
max1<-max(x, -Inf)
}else{
m<-length(x)
mean1<-sum(mean0*n, x)/(n+m)
Expand Down
86 changes: 52 additions & 34 deletions R/semi.sup.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,48 @@
semi.sup <-
function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.pres=0.5, min.run=12, mz.tol=1e-5, baseline.correct.noise.percentile=0.05, shape.model="bi-Gaussian", BIC.factor=2, baseline.correct=0, peak.estim.method="moment", min.bw=NA, max.bw=NA, sd.cut=c(0.01,500), sigma.ratio.lim=c(0.01, 100), component.eliminate=0.01, moment.power=1, subs=NULL, align.mz.tol=NA, align.chr.tol=NA, max.align.mz.diff=0.01, pre.process=FALSE, recover.mz.range=NA, recover.chr.range=NA, use.observed.range=TRUE, match.tol.ppm=NA, new.feature.min.count=2, recover.min.count=3, intensity.weighted=FALSE)
semi.sup <- function(
files,
folder,
known.table=NA,
n.nodes=4,
min.exp=2,
min.pres=0.5,
min.run=12,
mz.tol=1e-5,
baseline.correct.noise.percentile=0.05,
shape.model="bi-Gaussian",
BIC.factor=2,
baseline.correct=0,
peak.estim.method="moment",
min.bw=NA,
max.bw=NA,
sd.cut=c(0.01,500),
sigma.ratio.lim=c(0.01, 100),
component.eliminate=0.01,
moment.power=1,
align.mz.tol=NA,
align.chr.tol=NA,
max.align.mz.diff=0.01,
pre.process=FALSE,
recover.mz.range=NA,
recover.chr.range=NA,
use.observed.range=TRUE,
match.tol.ppm=NA,
new.feature.min.count=2,
recover.min.count=3,
intensity.weighted=FALSE)
{
setwd(folder)

files<-dir(pattern=file.pattern, ignore.case = TRUE)
files<-files[order(files)]
if(!is.null(subs))
{
if(!is.na(subs[1])) files<-files[subs]
}

###############################################################################################

try(dir.create("error_files"), silent = TRUE)
message("***************************** prifiles --> feature lists *****************************")
message("** extracting features")
suf.prof<-paste(min.pres,min.run,mz.tol,baseline.correct,sep="_")
suf<-paste(suf.prof, shape.model, sd.cut[1], sd.cut[2],component.eliminate, moment.power, sep="_")
if(shape.model=="bi-Gaussian") suf<-paste(suf, sigma.ratio.lim[1], sigma.ratio.lim[2],sep="_")

to.do<-paste(matrix(unlist(strsplit(tolower(files),"\\.")),nrow=2)[1,],suf, min.bw, max.bw,".feature",sep="_")
to.do<-which(!(to.do %in% dir()))
message(c("number of files to process: ", length(to.do)))

to.do<-which(!(to.do %in% dir()))

if(length(to.do)>0)
{
Expand All @@ -31,7 +52,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
cl <- makeCluster(n.nodes)
registerDoParallel(cl)
#clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
clusterEvalQ(cl, library(apLCMS))
clusterEvalQ(cl, library(recetox.aplcms))


features<-foreach(i=2:length(grps)) %dopar%
Expand Down Expand Up @@ -81,14 +102,13 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
for(i in 1:length(files))
{
this.name<-paste(strsplit(tolower(files[i]),"\\.")[[1]][1],suf, min.bw, max.bw,".feature",sep="_")
cat(this.name, " ")
load(this.name)
features[[i]]<-this.feature
}

###############################################################################################
message("****************************** time correction ***************************************")
suf<-paste(suf,align.mz.tol,align.chr.tol,subs[1],subs[length(subs)],sep="_")
message("** correcting time...")
suf<-paste(suf,align.mz.tol,align.chr.tol,files[1],files[length(files)],sep="_")
this.name<-paste("time_correct_done_",suf,".bin",sep="")

all.files<-dir()
Expand All @@ -100,9 +120,9 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
cl <- makeCluster(n.nodes)
registerDoParallel(cl)
#clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
clusterEvalQ(cl, library(apLCMS))
clusterEvalQ(cl, library(recetox.aplcms))

message(c("***** correcting time, CPU time (seconds) ",as.vector(system.time(f2<-adjust.time(features,mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
message(c("** correction time, CPU time (seconds) ",as.vector(system.time(f2<-adjust.time(features,mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))

stopCluster(cl)
save(f2,file=this.name)
Expand All @@ -113,7 +133,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
gc()

###############################################################################################
message("**************************** aligning features **************************************")
message("** aligning features...")
suf<-paste(suf,min.exp,sep="_")
this.name<-paste("aligned_done_",suf,".bin",sep="")
all.files<-dir()
Expand All @@ -125,9 +145,9 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
cl <- makeCluster(n.nodes)
registerDoParallel(cl)
#clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
clusterEvalQ(cl, library(apLCMS))
clusterEvalQ(cl, library(recetox.aplcms))

message(c("***** aligning features, CPU time (seconds): ", as.vector(system.time(aligned<-feature.align(f2, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
message(c("** aligned features, CPU time (seconds): ", as.vector(system.time(aligned<-feature.align(f2, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
save(aligned,file=this.name)
stopCluster(cl)
}else{
Expand All @@ -137,7 +157,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.


###############################################################################################
message("************************* merging to known peak table *********************************")
message("** merging to known peak table")
if(is.na(match.tol.ppm)) match.tol.ppm<-aligned$mz.tol*1e6

this.name<-paste("merge_done_", suf, ".bin", sep="")
Expand Down Expand Up @@ -242,13 +262,11 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
gc()

###############################################################################################
message("**************************** recovering weaker signals *******************************")
message("** recovering weaker signals")
suf<-paste(suf,recover.mz.range, recover.chr.range, use.observed.range,match.tol.ppm,new.feature.min.count,recover.min.count,sep="_")

worklist<-paste(matrix(unlist(strsplit(tolower(files),"\\.")),nrow=2)[1,],suf,"semi_sup.recover",sep="_")
to.do<-which(!(worklist %in% dir()))
message(c("number of files to process: ", length(to.do)))

to.do<-which(!(worklist %in% dir()))
if(length(to.do)>0)
{
grps<-round(seq(0, length(to.do), length=n.nodes+1))
Expand All @@ -257,7 +275,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
cl <- makeCluster(n.nodes)
registerDoParallel(cl)
#clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
clusterEvalQ(cl, library(apLCMS))
clusterEvalQ(cl, library(recetox.aplcms))

features.recov<-foreach(i=2:length(grps)) %dopar%
{
Expand All @@ -275,7 +293,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.


##############################################################################################
message("loading feature tables after recovery")
message("** loading feature tables after recovery...")
features.recov<-new("list")

for(i in 1:length(files))
Expand All @@ -286,7 +304,7 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
}

##############################################################################################
message("****************************** second round time correction *************************")
message("** second round time correction...")
suf<-paste(suf,"round 2",sep="_")
this.name<-paste("time_correct_done_",suf,".bin",sep="")

Expand All @@ -298,17 +316,17 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
cl <- makeCluster(n.nodes)
registerDoParallel(cl)
#clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
clusterEvalQ(cl, library(apLCMS))
clusterEvalQ(cl, library(recetox.aplcms))

message(c("***** correcting time, CPU time (seconds) ",as.vector(system.time(f2.recov<-adjust.time(features.recov, mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
message(c("** correction time, CPU time (seconds) ",as.vector(system.time(f2.recov<-adjust.time(features.recov, mz.tol=align.mz.tol, chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
save(f2.recov,file=this.name)
stopCluster(cl)
}else{
load(this.name)
}
gc()
###############################################################################################
message("**************************** second round aligning features *************************")
message("** second round aligning features...")
suf<-paste(suf,"min_exp", min.exp, 1,sep="_")
this.name<-paste("aligned_done_",suf,".bin",sep="")
all.files<-dir()
Expand All @@ -318,9 +336,9 @@ function(folder, file.pattern=".cdf", known.table=NA, n.nodes=4, min.exp=2, min.
cl <- makeCluster(n.nodes)
registerDoParallel(cl)
#clusterEvalQ(cl, source("~/Desktop/Dropbox/1-work/apLCMS_code/new_proc_cdf.r"))
clusterEvalQ(cl, library(apLCMS))
clusterEvalQ(cl, library(recetox.aplcms))

message(c("***** aligning features, CPU time (seconds): ", as.vector(system.time(aligned.recov<-feature.align(f2.recov, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
message(c("** aligned features, CPU time (seconds): ", as.vector(system.time(aligned.recov<-feature.align(f2.recov, min.exp=min.exp,mz.tol=align.mz.tol,chr.tol=align.chr.tol, find.tol.max.d=10*mz.tol, max.align.mz.diff=max.align.mz.diff)))[1]))
save(aligned.recov,file=this.name)
stopCluster(cl)
}else{
Expand Down
Loading

0 comments on commit 1301763

Please sign in to comment.