diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 55069af9..3a49acbf 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,19 +12,29 @@
"features": {
"git": {
"version": "latest",
- "ppa": false
+ "ppa": false,
},
"common": {
"installZsh": "false",
"username": "false",
-
}
},
// Add the IDs of extensions you want installed when the container is created.
- "extensions": ["ikuyadeu.r"],
+ "extensions": [
+ "reditorsupport.r",
+ "rdebugger.r-debugger",
+ "eamodio.gitlens",
+ "mutantdino.resourcemonitor",
+ "meakbiyik.vscode-r-test-adapter",
+ "dvirtz.parquet-viewer"
+ ],
+ "settings": {
+ "r.rterm.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/radian",
+ "r.rpath.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/R"
+ },
- "onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8",
+ "onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && git config --global --add safe.directory /workspaces/recetox-aplcms",
"postAttachCommand": "/bin/bash"
}
diff --git a/DESCRIPTION b/DESCRIPTION
index 0e9a580b..2c3a32a3 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,3 +17,4 @@ NeedsCompilation: no
Suggests:
arrow, dataCompareR, testthat (>= 3.0.0)
Config/testthat/edition: 3
+RoxygenNote: 7.2.0
diff --git a/R/adaptive.bin.R b/R/adaptive.bin.R
index c87e6755..f70eefd4 100644
--- a/R/adaptive.bin.R
+++ b/R/adaptive.bin.R
@@ -1,162 +1,191 @@
-adaptive.bin <-
-function(x, min.run, min.pres, tol, baseline.correct, weighted=FALSE)
-{
- masses<-x$masses
- labels<-x$labels
- intensi<-x$intensi
- times<-x$times
-
- rm(x)
-
- base.curve<-unique(times)
- base.curve<-base.curve[order(base.curve)]
- base.curve<-cbind(base.curve, base.curve*0)
-
- curr.order<-order(masses)
- intensi<-intensi[curr.order]
- labels<-labels[curr.order]
- masses<-masses[curr.order]
-
- rm(curr.order)
-
- cat(c("m/z tolerance is: ", tol,"\n"))
-
- l<-length(masses)
-
- curr.bw<-0.5*tol*max(masses)
- if(weighted)
- {
- all.mass.den<-density(masses, weights=intensi/sum(intensi), bw=curr.bw, n=2^min(15, floor(log2(l))-2))
- }else{
- all.mass.den<-density(masses, bw=curr.bw, n=2^min(15, floor(log2(l))-2))
- }
- all.mass.turns<-find.turn.point(all.mass.den$y)
- all.mass.vlys<-all.mass.den$x[all.mass.turns$vlys]
- breaks<-c(0, unique(round(approx(masses,1:l,xout=all.mass.vlys,rule=2,ties='ordered')$y))[-1])
-
- grps<-masses*0 # this is which peak group the signal belongs to, not time group
-
- min.lab<-min(labels)
- max.lab<-max(labels)
- times<-unique(labels)
- times<-times[order(times)]
- curr.label<-1
- min.count.run<-min.run*length(times)/(max(times)-min(times))
- time.range<-diff(range(times))
- aver.time.range<-(max(labels)-min(labels))/length(times)
-
- newprof<-matrix(0, nrow=length(masses),ncol=4)
- height.rec<-matrix(0, nrow=length(masses),ncol=3)
- prof.pointer<-1
- height.pointer<-1
-
- for(i in 1:(length(breaks)-1))
- {
- this.labels<-labels[(breaks[i]+1):breaks[i+1]]
- if(length(unique(this.labels)) >= min.count.run*min.pres)
- {
- this.masses<-masses[(breaks[i]+1):breaks[i+1]]
- this.intensi<-intensi[(breaks[i]+1):breaks[i+1]]
-
- curr.order<-order(this.labels)
- this.masses<-this.masses[curr.order]
- this.intensi<-this.intensi[curr.order]
- this.labels<-this.labels[curr.order]
-
- this.bw=0.5*tol*median(this.masses)
- if(weighted)
- {
- mass.den<-density(this.masses, weights=this.intensi/sum(this.intensi), bw=this.bw)
- }else{
- mass.den<-density(this.masses, bw=this.bw)
- }
- mass.den$y[mass.den$y < min(this.intensi)/10]<-0
- mass.turns<-find.turn.point(mass.den$y)
- mass.pks<-mass.den$x[mass.turns$pks]
- mass.vlys<-c(-Inf, mass.den$x[mass.turns$vlys], Inf)
-
-
- for(j in 1:length(mass.pks))
- {
- mass.lower<-max(mass.vlys[mass.vlys < mass.pks[j]])
- mass.upper<-min(mass.vlys[mass.vlys > mass.pks[j]])
-
- if(length(mass.pks) == 1) mass.lower<-mass.lower-1
- mass.sel<-which(this.masses > mass.lower & this.masses <= mass.upper)
-
- if(length(mass.sel) > 0)
- {
- that.labels<-this.labels[mass.sel]
- that.masses<-this.masses[mass.sel]
- that.intensi<-this.intensi[mass.sel]
-
- that.merged<-combine.seq.3(that.labels, that.masses, that.intensi)
- if(nrow(that.merged)==1)
- {
- new.merged<-that.merged
- }else{
- new.merged<-that.merged[order(that.merged[,1]),]
- }
-
- that.labels<-new.merged[,2]
- that.masses<-new.merged[,1]
- that.intensi<-new.merged[,3]
- that.range<-diff(range(that.labels))
-
- if(that.range > 0.5*time.range & length(that.labels) > that.range*min.pres & length(that.labels)/(diff(range(that.labels))/aver.time.range) > min.pres)
- {
- that.intensi<-rm.ridge(that.labels, that.intensi, bw=max(10*min.run, that.range/2))
-
- that.sel<-which(that.intensi != 0)
- that.labels<-that.labels[that.sel]
- that.masses<-that.masses[that.sel]
- that.intensi<-that.intensi[that.sel]
- }
- that.n<-length(that.masses)
- newprof[prof.pointer:(prof.pointer+that.n-1),]<-cbind(that.masses, that.labels, that.intensi, rep(curr.label, that.n))
- prof.pointer<-prof.pointer+that.n
- height.rec[height.pointer,]<-c(curr.label,that.n, max(that.intensi))
- height.pointer<-height.pointer+1
- curr.label<-curr.label+1
- }
- }
- }else{
- if(runif(1)<0.05)
- {
- this.masses<-masses[(breaks[i]+1):breaks[i+1]]
- this.intensi<-intensi[(breaks[i]+1):breaks[i+1]]
- curr.order<-order(this.labels)
- this.masses<-this.masses[curr.order]
- this.intensi<-this.intensi[curr.order]
- this.labels<-this.labels[curr.order]
-
-
- that.merged<-combine.seq.3(this.labels, this.masses, this.intensi)
- that.n<-nrow(that.merged)
- newprof[prof.pointer:(prof.pointer+that.n-1),]<-cbind(that.merged, rep(curr.label, that.n))
- prof.pointer<-prof.pointer+that.n
-
- height.rec[height.pointer,]<-c(curr.label,that.n, max(that.merged[,3]))
- height.pointer<-height.pointer+1
- curr.label<-curr.label+1
- }
+compute_densities <- function(masses, tol, weighted, intensities, bw_func, n = 512) {
+ bandwidth <- 0.5 * tol * bw_func(masses)
+ if (weighted) {
+ weights <- intensities / sum(intensities)
+ all.mass.den <- density(masses, weights = weights, bw = bandwidth, n = n)
+ } else {
+ all.mass.den <- density(masses, bw = bandwidth, n = n)
+ }
+ return(all.mass.den)
+}
+
+compute_mass_values <- function(tol, masses, intensi, weighted) {
+ n <- 2^min(15, floor(log2(length(masses))) - 2)
+
+ all.mass.den <- compute_densities(masses, tol, weighted, intensi, max, n)
+
+ all.mass.turns <- find.turn.point(all.mass.den$y)
+ all.mass.vlys <- all.mass.den$x[all.mass.turns$vlys]
+ return(all.mass.vlys)
+}
+
+compute_breaks <- function(tol, masses, intensi, weighted) {
+ all.mass.vlys <- compute_mass_values(tol, masses, intensi, weighted)
+ breaks <- c(0, unique(round(approx(masses, 1:length(masses), xout = all.mass.vlys, rule = 2, ties = "ordered")$y))[-1])
+ return(breaks)
+}
+
+adaptive.bin <- function(x,
+ min.run,
+ min.pres,
+ tol,
+ baseline.correct,
+ weighted = FALSE) {
+ # order inputs after mz values
+ masses <- x$masses
+ labels <- x$labels
+ intensi <- x$intensi
+ times <- x$times
+
+ rm(x)
+
+ curr.order <- order(masses)
+ intensi <- intensi[curr.order]
+ labels <- labels[curr.order]
+ masses <- masses[curr.order]
+
+ rm(curr.order)
+
+ cat(c("m/z tolerance is: ", tol, "\n"))
+
+ times <- unique(labels)
+ times <- times[order(times)]
+
+ # calculate function parameters
+ min.count.run <- min.run * length(times) / (max(times) - min(times))
+ time.range <- diff(range(times))
+ aver.time.range <- (max(labels) - min(labels)) / length(times)
+
+ # init data
+ newprof <- matrix(0, nrow = length(masses), ncol = 4)
+ height.rec <- matrix(0, nrow = length(masses), ncol = 3)
+
+ # init counters
+ curr.label <- 1
+ prof.pointer <- 1
+ height.pointer <- 1
+
+ breaks <- compute_breaks(tol, masses, intensi, weighted)
+
+ for (i in 1:(length(breaks) - 1))
+ {
+ start <- breaks[i] + 1
+ end <- breaks[i + 1]
+ # get number of scans in bin
+ this.labels <- labels[start: end]
+
+ if (length(unique(this.labels)) >= min.count.run * min.pres) {
+ # extract mz and intensity values for bin
+ this.masses <- masses[start:end]
+ this.intensi <- intensi[start:end]
+
+ # reorder in order of labels (scan number)
+ curr.order <- order(this.labels)
+ this.masses <- this.masses[curr.order]
+ this.intensi <- this.intensi[curr.order]
+ this.labels <- this.labels[curr.order]
+
+ mass.den <- compute_densities(this.masses, tol, weighted, this.intensi, median)
+
+ mass.den$y[mass.den$y < min(this.intensi) / 10] <- 0
+ mass.turns <- find.turn.point(mass.den$y)
+ mass.pks <- mass.den$x[mass.turns$pks]
+ mass.vlys <- c(-Inf, mass.den$x[mass.turns$vlys], Inf)
+
+
+ for (j in 1:length(mass.pks))
+ {
+ # compute boundaries
+ mass.lower <- max(mass.vlys[mass.vlys < mass.pks[j]])
+ mass.upper <- min(mass.vlys[mass.vlys > mass.pks[j]])
+
+ if (length(mass.pks) == 1) mass.lower <- mass.lower - 1
+
+ # compute if we are in mass range from mass.lower to mass.upper
+ mass.sel <- which(this.masses > mass.lower & this.masses <= mass.upper)
+
+ if (length(mass.sel) > 0) {
+
+ # get rows which fulfill condition
+ that.labels <- this.labels[mass.sel]
+ that.masses <- this.masses[mass.sel]
+ that.intensi <- this.intensi[mass.sel]
+
+ # rearrange in order of labels
+ that.merged <- combine.seq.3(that.labels, that.masses, that.intensi)
+ if (nrow(that.merged) == 1) {
+ new.merged <- that.merged
+ } else {
+ new.merged <- that.merged[order(that.merged[, 1]), ]
+ }
+
+ that.labels <- new.merged[, 2]
+ that.masses <- new.merged[, 1]
+ that.intensi <- new.merged[, 3]
+ that.range <- diff(range(that.labels))
+
+ if (that.range > 0.5 * time.range & length(that.labels) > that.range * min.pres & length(that.labels) / (diff(range(that.labels)) / aver.time.range) > min.pres) {
+ that.intensi <- rm.ridge(that.labels, that.intensi, bw = max(10 * min.run, that.range / 2))
+
+ # filter out 0 entries
+ that.sel <- which(that.intensi != 0)
+ that.labels <- that.labels[that.sel]
+ that.masses <- that.masses[that.sel]
+ that.intensi <- that.intensi[that.sel]
+ }
+
+ that.n <- length(that.masses)
+
+ newprof[prof.pointer:(prof.pointer + that.n - 1), ] <- cbind(that.masses, that.labels, that.intensi, rep(curr.label, that.n))
+ height.rec[height.pointer, ] <- c(curr.label, that.n, max(that.intensi))
+
+ # increment counters
+ prof.pointer <- prof.pointer + that.n
+ height.pointer <- height.pointer + 1
+ curr.label <- curr.label + 1
}
+ }
+ } else {
+ if (runif(1) < 0.05) {
+
+ # reassignment
+ this.masses <- masses[start:end]
+ this.intensi <- intensi[start:end]
+
+ # reordering
+ curr.order <- order(this.labels)
+ this.masses <- this.masses[curr.order]
+ this.intensi <- this.intensi[curr.order]
+ this.labels <- this.labels[curr.order]
+
+ that.merged <- combine.seq.3(this.labels, this.masses, this.intensi)
+ that.n <- nrow(that.merged)
+
+ newprof[prof.pointer:(prof.pointer + that.n - 1), ] <- cbind(that.merged, rep(curr.label, that.n))
+ height.rec[height.pointer, ] <- c(curr.label, that.n, max(that.merged[, 3]))
+
+ # increment counters
+ prof.pointer <- prof.pointer + that.n
+ height.pointer <- height.pointer + 1
+ curr.label <- curr.label + 1
+ }
}
-
- newprof<-newprof[1:(prof.pointer-1),]
- height.rec<-height.rec[1:(height.pointer-1),]
-
- newprof<-newprof[order(newprof[,1],newprof[,2]),]
-
- raw.prof<-new("list")
- raw.prof$height.rec<-height.rec
- raw.prof$masses<-newprof[,1]
- raw.prof$labels<-newprof[,2]
- raw.prof$intensi<-newprof[,3]
- raw.prof$grps<-newprof[,4]
- raw.prof$times<-times
- raw.prof$tol<-tol
- raw.prof$min.count.run<-min.count.run
-
- return(raw.prof)
+ }
+
+ newprof <- newprof[1:(prof.pointer - 1), ]
+ height.rec <- height.rec[1:(height.pointer - 1), ]
+
+ newprof <- newprof[order(newprof[, 1], newprof[, 2]), ]
+
+ raw.prof <- new("list")
+ raw.prof$height.rec <- height.rec
+ raw.prof$masses <- newprof[, 1]
+ raw.prof$labels <- newprof[, 2]
+ raw.prof$intensi <- newprof[, 3]
+ raw.prof$grps <- newprof[, 4]
+ raw.prof$times <- times
+ raw.prof$tol <- tol
+ raw.prof$min.count.run <- min.count.run
+
+ return(raw.prof)
}
diff --git a/R/extract_features.R b/R/extract_features.R
new file mode 100644
index 00000000..eb8ea1e2
--- /dev/null
+++ b/R/extract_features.R
@@ -0,0 +1,63 @@
+extract_features <- function(
+ cluster,
+ filenames,
+ min_pres,
+ min_run,
+ mz_tol,
+ baseline_correct,
+ baseline_correct_noise_percentile,
+ intensity_weighted,
+ min_bandwidth,
+ max_bandwidth,
+ sd_cut,
+ sigma_ratio_lim,
+ shape_model,
+ peak_estim_method,
+ component_eliminate,
+ moment_power,
+ BIC_factor
+) {
+ clusterExport(cluster, list(
+ 'proc.cdf',
+ 'prof.to.features',
+ 'load.lcms',
+ 'adaptive.bin',
+ 'find.turn.point',
+ 'combine.seq.3',
+ 'cont.index',
+ 'interpol.area',
+ 'load_file',
+ 'load_data',
+ 'plot_raw_profile_histogram',
+ 'compute_mass_values',
+ 'compute_densities',
+ 'compute_breaks'
+ ))
+
+ parLapply(cluster, filenames, function(filename) {
+ profile <- proc.cdf(
+ filename = filename,
+ min.pres = min_pres,
+ min.run = min_run,
+ tol = mz_tol,
+ baseline.correct = baseline_correct,
+ baseline.correct.noise.percentile = baseline_correct_noise_percentile,
+ intensity.weighted = intensity_weighted,
+ do.plot = FALSE,
+ cache = FALSE
+ )
+ features <- prof.to.features(
+ a = profile,
+ min.bw = min_bandwidth,
+ max.bw = max_bandwidth,
+ sd.cut = sd_cut,
+ sigma.ratio.lim = sigma_ratio_lim,
+ shape.model = shape_model,
+ estim.method = peak_estim_method,
+ component.eliminate = component_eliminate,
+ power = moment_power,
+ BIC.factor = BIC_factor,
+ do.plot = FALSE
+ )
+ })
+}
diff --git a/R/plot.R b/R/plot.R
new file mode 100644
index 00000000..972f35da
--- /dev/null
+++ b/R/plot.R
@@ -0,0 +1,34 @@
+plot_raw_profile_histogram <- function(
+ raw.prof,
+ min.pres,
+ baseline.correct,
+ baseline.correct.noise.percentile,
+ tol,
+ new.prof
+) {
+ h.1 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] <= max(2, raw.prof$min.count.run * min.pres / 2), 3])
+ h.2 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres, 3])
+
+ if (is.na(baseline.correct)) {
+ baseline.correct <- 10^quantile(h.1, baseline.correct.noise.percentile)
+ message(c("maximal height cut is automatically set at the", baseline.correct.noise.percentile, "percentile of noise group heights: ", baseline.correct))
+ } else {
+ message(c("maximal height cut is provided by user: ", baseline.correct))
+ }
+ par(mfrow = c(2, 2))
+
+ plot(c(-1, 1), c(-1, 1), type = "n", xlab = "", ylab = "", main = "tolerance level loaded", axes = FALSE)
+ text(x = 0, y = 0, tol, cex = 1.2)
+
+ if (length(h.1) > 50) {
+ plot(density(h.1), xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), main = "Black - noise groups \n Blue - selected groups")
+ } else {
+ plot(NA, NA, xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), ylim = c(0, 1), main = "Black - noise groups \n Blue - selected groups")
+ if (length(h.1) > 0) abline(v = h.1)
+ }
+
+ abline(v = log10(baseline.correct), col = "red")
+ lines(density(log10(new.prof$height.rec)), col = "blue")
+ hist(new.prof$time.range.rec, xlab = "Range of retention time in the same group", ylab = "Density", freq = FALSE, nclass = 100, main = "Group retention time range distribution")
+ hist(new.prof$mz.pres.rec, xlab = "% signal present in the same group", ylab = "Density", freq = FALSE, nclass = 20, main = "Group % present signal distribution")
+}
\ No newline at end of file
diff --git a/R/proc.cdf.R b/R/proc.cdf.R
index ce82f8fc..7c97c159 100644
--- a/R/proc.cdf.R
+++ b/R/proc.cdf.R
@@ -1,59 +1,73 @@
-proc.cdf <- function(filename, min.pres = 0.5, min.run = 12, tol = 1e-5, baseline.correct = 0, baseline.correct.noise.percentile = 0, do.plot = TRUE, intensity.weighted = FALSE, cache = TRUE) {
+load_file <- function(filename) {
+ this <- load.lcms(filename)
+
+ # this could eventually be replaced using drop_na
+ na.sel <- c(which(is.na(this$masses)), which(is.na(this$labels)), which(is.na(this$intensi)))
+ if (length(na.sel) > 0) {
+ na.sel <- unique(na.sel)
+ this$masses <- this$masses[-na.sel]
+ this$labels <- this$labels[-na.sel]
+ this$intensi <- this$intensi[-na.sel]
+
+ warning("there are NA values in the m/z or intensity. Check the file:", filename)
+ }
+ # TODO
+ # this <- tidyr::drop_na(as.data.frame(this))
+ return(this)
+}
+
+
+load_data <- function(filename,
+ cache,
+ min.run,
+ min.pres,
+ tol,
+ baseline.correct,
+ intensity.weighted) {
rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "")
if (cache && file.exists(rawprof_filename)) {
load(rawprof_filename)
} else {
- this <- load.lcms(filename)
-
- na.sel <- c(which(is.na(this$masses)), which(is.na(this$labels)), which(is.na(this$intensi)))
- if (length(na.sel) > 0) {
- na.sel <- unique(na.sel)
- this$masses <- this$masses[-na.sel]
- this$labels <- this$labels[-na.sel]
- this$intensi <- this$intensi[-na.sel]
-
- warning("there are NA values in the m/z or intensity. Check the file:", filename)
- }
- raw.prof <- adaptive.bin(this, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
+ raw.data <- load_file(filename)
+ raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted)
}
if (cache && !file.exists(rawprof_filename)) {
save(raw.prof, file = rawprof_filename)
}
- newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps)
- h.1 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] <= max(2, raw.prof$min.count.run * min.pres / 2), 3])
- h.2 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres, 3])
+ return(raw.prof)
+}
- if (is.na(baseline.correct)) {
- baseline.correct <- 10^quantile(h.1, baseline.correct.noise.percentile)
- message(c("maximal height cut is automatically set at the", baseline.correct.noise.percentile, "percentile of noise group heights: ", baseline.correct))
- } else {
- message(c("maximal height cut is provided by user: ", baseline.correct))
- }
+#' Process file and return the profile.
+#' @export
+proc.cdf <- function(filename,
+ min.pres = 0.5,
+ min.run = 12,
+ tol = 1e-05,
+ baseline.correct = 0.0,
+ baseline.correct.noise.percentile = 0.05,
+ do.plot = FALSE,
+ intensity.weighted = FALSE,
+ cache = FALSE) {
+ raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted)
+ newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps)
run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1]
- newprof <- newprof[newprof[, 4] %in% run.sel,]
+
+ newprof <- newprof[newprof[, 4] %in% run.sel, ]
new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run)
if (do.plot) {
- par(mfrow = c(2, 2))
-
- plot(c(-1,1), c(-1,1), type = "n", xlab = "", ylab = "", main = "tolerance level loaded", axes = FALSE)
- text(x = 0, y = 0, tol, cex = 1.2)
-
- if (length(h.1) > 50) {
- plot(density(h.1), xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), main = "Black - noise groups \n Blue - selected groups")
- } else {
- plot(NA, NA, xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), ylim = c(0, 1), main = "Black - noise groups \n Blue - selected groups")
- if (length(h.1) > 0) abline(v = h.1)
- }
-
- abline(v = log10(baseline.correct), col = "red")
- lines(density(log10(new.prof$height.rec)), col = "blue")
- hist(new.prof$time.range.rec, xlab = "Range of retention time in the same group", ylab = "Density", freq = FALSE, nclass = 100, main = "Group retention time range distribution")
- hist(new.prof$mz.pres.rec, xlab = "% signal present in the same group", ylab = "Density", freq = FALSE, nclass = 20, main = "Group % present signal distribution")
+ plot_raw_profile_histogram(
+ raw.prof,
+ min.pres,
+ baseline.correct,
+ baseline.correct.noise.percentile,
+ tol,
+ new.prof
+ )
}
return(new.prof$new.rec)
diff --git a/R/prof.to.features.R b/R/prof.to.features.R
index 54084d9e..c3c7efac 100644
--- a/R/prof.to.features.R
+++ b/R/prof.to.features.R
@@ -1,712 +1,683 @@
-prof.to.features <-
-function(a, bandwidth=0.5, min.bw=NA, max.bw=NA, sd.cut=c(0.1,100), sigma.ratio.lim=c(0.1, 10), shape.model="bi-Gaussian", estim.method="moment",do.plot=TRUE, power=1, component.eliminate=0.01, BIC.factor=2)
-{
- if(sum(shape.model %in% c("bi-Gaussian", "Gaussian")) == 0)
- {
+prof.to.features <- function(a,
+ bandwidth = 0.5,
+ min.bw = NA,
+ max.bw = NA,
+ sd.cut = c(0.01, 500),
+ sigma.ratio.lim = c(0.01, 100),
+ shape.model = "bi-Gaussian",
+ estim.method = "moment",
+ do.plot = TRUE,
+ power = 1,
+ component.eliminate = 0.01,
+ BIC.factor = 2) {
+ if (sum(shape.model %in% c("bi-Gaussian", "Gaussian")) == 0) {
print("Error: peak shape model has to be Gaussian or bi-Gaussian")
return(0)
}
- if(sum(estim.method %in% c("moment", "EM")) == 0)
- {
+ if (sum(estim.method %in% c("moment", "EM")) == 0) {
print("Error: peak model estimation method has to be moment or EM")
return(0)
}
-
+
#####################
- bigauss.esti.EM <- function(t, x, max.iter=50, epsilon=0.005, power=1, do.plot=FALSE, truth=NA, sigma.ratio.lim=c(0.3, 1))
- {
-
+ bigauss.esti.EM <- function(t, x, max.iter = 50, epsilon = 0.005, power = 1, do.plot = FALSE, truth = NA, sigma.ratio.lim = c(0.3, 1)) {
+
## function takes into x and t, and then computes the value of
## sigma.1, sigma.2 and a using iterative method. the returned
## values include estimated sigmas, a and a boolean variable on
## whether the termination criteria is satified upon the end of the
## program.
-
- sel<-which(x>1e-10)
- if(length(sel)==0)
- {
- return(c(median(t),1,1,0))
+
+ sel <- which(x > 1e-10)
+ if (length(sel) == 0) {
+ return(c(median(t), 1, 1, 0))
}
- if(length(sel)==1)
- {
- return(c(t[sel], 1,1,0))
+ if (length(sel) == 1) {
+ return(c(t[sel], 1, 1, 0))
}
- t<-t[sel]
- x<-x[sel]
- solve.sigma <- function(x,t,a){
-
+ t <- t[sel]
+ x <- x[sel]
+ solve.sigma <- function(x, t, a) {
+
## this function takes the value intensity level x, retention time t
## and assumed breaking point a, calculates the square estimated of
## sigma.1 and sigma.2.
-
- prep.uv <- function(x,t,a){
-
+
+ prep.uv <- function(x, t, a) {
+
## this function prepares the parameters required for latter
## compuation. u, v, and sum of x.
-
- temp <- (t-a)^2 * x
- u <- sum(temp * as.numeric(t=a))
- return(list(u=u,
- v=v,
- x.sum=sum(x)))
+
+ temp <- (t - a)^2 * x
+ u <- sum(temp * as.numeric(t < a))
+ v <- sum(temp * as.numeric(t >= a))
+ return(list(
+ u = u,
+ v = v,
+ x.sum = sum(x)
+ ))
}
- tt <- prep.uv(x,t,a)
- sigma.1 <- tt$u/tt$x.sum*((tt$v/tt$u)^(1/3)+1)
- sigma.2 <- tt$v/tt$x.sum*((tt$u/tt$v)^(1/3)+1)
- return(list(sigma.1=sigma.1,
- sigma.2=sigma.2))
+ tt <- prep.uv(x, t, a)
+ sigma.1 <- tt$u / tt$x.sum * ((tt$v / tt$u)^(1 / 3) + 1)
+ sigma.2 <- tt$v / tt$x.sum * ((tt$u / tt$v)^(1 / 3) + 1)
+ return(list(
+ sigma.1 = sigma.1,
+ sigma.2 = sigma.2
+ ))
}
-
- solve.a <- function(x,t,a,sigma.1,sigma.2){
-
+
+ solve.a <- function(x, t, a, sigma.1, sigma.2) {
+
## thif function solves the value of a using the x, t, a from the
## previous step, and sigma.1, and sigma.2
-
- w <- x * (as.numeric(t=a)/sigma.2)
- return(sum(t * w)/ sum(w))
+
+ w <- x * (as.numeric(t < a) / sigma.1 + as.numeric(t >= a) / sigma.2)
+ return(sum(t * w) / sum(w))
}
-
+
## epsilon is the threshold for continuing the iteration. change in
## a smaller than epsilon will terminate the iteration.
## epsilon <- min(diff(sort(t)))/2
-
+
## using the median value of t as the initial value of a.
- a.old <- t[which(x==max(x))[1]]
+ a.old <- t[which(x == max(x))[1]]
a.new <- a.old
- change <- 10*epsilon
-
+ change <- 10 * epsilon
+
## n.iter is the number of iteration covered so far.
n.iter <- 0
-
- while((change>epsilon) & (n.iter epsilon) & (n.iter < max.iter)) {
## print(c(n.iter,change))
a.old <- a.new
- n.iter <- n.iter+1
- sigma <- solve.sigma(x,t,a.old)
- if(n.iter == 1) sigma[is.na(sigma)]<-as.numeric(sigma[which(!is.na(sigma))])[1]/10
- a.new <- solve.a(x,t,a.old,sigma$sigma.1,sigma$sigma.2)
- change <- abs(a.old-a.new)
+ n.iter <- n.iter + 1
+ sigma <- solve.sigma(x, t, a.old)
+ if (n.iter == 1) sigma[is.na(sigma)] <- as.numeric(sigma[which(!is.na(sigma))])[1] / 10
+ a.new <- solve.a(x, t, a.old, sigma$sigma.1, sigma$sigma.2)
+ change <- abs(a.old - a.new)
}
# return(list(a=a.new,
# sigma.1=sigma$sigma.1,
# sigma.2=sigma$sigma.2,
# iter.end=(max.iter>n.iter)
# ))
- d<-x
- sigma$sigma.2<-sqrt(sigma$sigma.2)
- sigma$sigma.1<-sqrt(sigma$sigma.1)
-
- d[t=a.new]<-dnorm(t[t>=a.new],mean=a.new,sd=sigma$sigma.2)*sigma$sigma.2
- scale<-exp(sum(d[d>1e-3]^2*log(x[d>1e-3]/d[d>1e-3]))/sum(d[d>1e-3]^2))
+ d <- x
+ sigma$sigma.2 <- sqrt(sigma$sigma.2)
+ sigma$sigma.1 <- sqrt(sigma$sigma.1)
+
+ d[t < a.new] <- dnorm(t[t < a.new], mean = a.new, sd = sigma$sigma.1) * sigma$sigma.1
+ d[t >= a.new] <- dnorm(t[t >= a.new], mean = a.new, sd = sigma$sigma.2) * sigma$sigma.2
+ scale <- exp(sum(d[d > 1e-3]^2 * log(x[d > 1e-3] / d[d > 1e-3])) / sum(d[d > 1e-3]^2))
return(c(a.new, sigma$sigma.1, sigma$sigma.2, scale))
}
-
- bigauss.esti<-function(x,y,power=1, do.plot=FALSE, truth=NA, sigma.ratio.lim=c(0.3, 3))
- {
- sel<-which(y>1e-10)
- if(length(sel)<2)
- {
- to.return<-c(median(x),1,1,0)
- }else{
- x<-x[sel]
- y<-y[sel]
- # sel<-order(x)
- # y<-y[sel]
- # x<-x[sel]
-
- y.0<-y
- if(do.plot) plot(x,y)
- if(do.plot & ! is.na(truth[1]))
- {
- true.y1<-dnorm(x[x=truth[1]],mean=truth[1],sd=truth[3])*truth[3]*truth[4]
- lines(x, c(true.y1,true.y2),col="green")
+
+ bigauss.esti <- function(x, y, power = 1, do.plot = FALSE, truth = NA, sigma.ratio.lim = c(0.3, 3)) {
+ sel <- which(y > 1e-10)
+ if (length(sel) < 2) {
+ to.return <- c(median(x), 1, 1, 0)
+ } else {
+ x <- x[sel]
+ y <- y[sel]
+ # sel<-order(x)
+ # y<-y[sel]
+ # x<-x[sel]
+
+ y.0 <- y
+ if (do.plot) plot(x, y)
+ if (do.plot & !is.na(truth[1])) {
+ true.y1 <- dnorm(x[x < truth[1]], mean = truth[1], sd = truth[2]) * truth[2] * truth[4]
+ true.y2 <- dnorm(x[x >= truth[1]], mean = truth[1], sd = truth[3]) * truth[3] * truth[4]
+ lines(x, c(true.y1, true.y2), col = "green")
}
- max.y.0<-max(y.0,na.rm=TRUE)
- y<-(y/max.y.0)^power
-
- l<-length(x)
- min.d<-min(diff(x))
- dx<-c(x[2]-x[1], (x[3:l]-x[1:(l-2)])/2, x[l]-x[l-1])
- if(l ==2) dx=rep(diff(x),2)
- dx[dx>4*min.d]<-4*min.d
-
- y.cum<-cumsum(y*dx)
- x.y.cum<-cumsum(y*x*dx)
- xsqr.y.cum<-cumsum(y*x^2*dx)
-
- y.cum.rev<-cumsum((y*dx)[l:1])[l:1] # reverse cum sum
- x.y.cum.rev<-cumsum((x*y*dx)[l:1])[l:1]
- xsqr.y.cum.rev<-cumsum((y*x^2*dx)[l:1])[l:1]
-
- sel<-which(y.cum >= sigma.ratio.lim[1]/(sigma.ratio.lim[1] + 1)*y.cum[l])
- if(length(sel)>0)
- {
- start<-max(1, min(sel))
- }else{
- start<-1
+ max.y.0 <- max(y.0, na.rm = TRUE)
+ y <- (y / max.y.0)^power
+
+ l <- length(x)
+ min.d <- min(diff(x))
+ dx <- c(x[2] - x[1], (x[3:l] - x[1:(l - 2)]) / 2, x[l] - x[l - 1])
+ if (l == 2) dx <- rep(diff(x), 2)
+ dx[dx > 4 * min.d] <- 4 * min.d
+
+ y.cum <- cumsum(y * dx)
+ x.y.cum <- cumsum(y * x * dx)
+ xsqr.y.cum <- cumsum(y * x^2 * dx)
+
+ y.cum.rev <- cumsum((y * dx)[l:1])[l:1] # reverse cum sum
+ x.y.cum.rev <- cumsum((x * y * dx)[l:1])[l:1]
+ xsqr.y.cum.rev <- cumsum((y * x^2 * dx)[l:1])[l:1]
+
+ sel <- which(y.cum >= sigma.ratio.lim[1] / (sigma.ratio.lim[1] + 1) * y.cum[l])
+ if (length(sel) > 0) {
+ start <- max(1, min(sel))
+ } else {
+ start <- 1
}
- sel<-which(y.cum <= sigma.ratio.lim[2]/(sigma.ratio.lim[2] + 1)*y.cum[l])
- if(length(sel)>0)
- {
- end<-min(l-1, max(sel))
- }else{
- end<-l-1
+ sel <- which(y.cum <= sigma.ratio.lim[2] / (sigma.ratio.lim[2] + 1) * y.cum[l])
+ if (length(sel) > 0) {
+ end <- min(l - 1, max(sel))
+ } else {
+ end <- l - 1
}
- if(end <= start)
- {
- m<-min(mean(x[start:end]), x[max(which(y.cum.rev>0))])
- }else{
- m.candi<-x[start:end]+diff(x[start:(end+1)])/2
- rec<-matrix(0, ncol=3, nrow=end-start+1)
-
- s1<-sqrt((xsqr.y.cum[start:end]+m.candi^2*y.cum[start:end]-2*m.candi*x.y.cum[start:end])/y.cum[start:end])
- s2<-sqrt((xsqr.y.cum.rev[start:end+1]+m.candi^2*y.cum.rev[start:end+1]-2*m.candi*x.y.cum.rev[start:end+1])/y.cum.rev[start:end+1])
- rec[,1]<-s1
- rec[,2]<-s2
- rec[,3]<-y.cum[start:end]/y.cum.rev[start:end+1]
-
- d<-log(rec[,1]/rec[,2])-log(rec[,3])
- if(min(d,na.rm=TRUE)*max(d,na.rm=TRUE) < 0)
- {
- sel<-c(which(d==max(d[d<0]))[1], which(d==min(d[d>=0])))
- m<-(sum(abs(d[sel])*m.candi[sel]))/(sum(abs(d[sel])))
- }else{
- d<-abs(d)
- m<-m.candi[which(d==min(d,na.rm=TRUE))[1]]
+ if (end <= start) {
+ m <- min(mean(x[start:end]), x[max(which(y.cum.rev > 0))])
+ } else {
+ m.candi <- x[start:end] + diff(x[start:(end + 1)]) / 2
+ rec <- matrix(0, ncol = 3, nrow = end - start + 1)
+
+ s1 <- sqrt((xsqr.y.cum[start:end] + m.candi^2 * y.cum[start:end] - 2 * m.candi * x.y.cum[start:end]) / y.cum[start:end])
+ s2 <- sqrt((xsqr.y.cum.rev[start:end + 1] + m.candi^2 * y.cum.rev[start:end + 1] - 2 * m.candi * x.y.cum.rev[start:end + 1]) / y.cum.rev[start:end + 1])
+ rec[, 1] <- s1
+ rec[, 2] <- s2
+ rec[, 3] <- y.cum[start:end] / y.cum.rev[start:end + 1]
+
+ d <- log(rec[, 1] / rec[, 2]) - log(rec[, 3])
+ if (min(d, na.rm = TRUE) * max(d, na.rm = TRUE) < 0) {
+ sel <- c(which(d == max(d[d < 0]))[1], which(d == min(d[d >= 0])))
+ m <- (sum(abs(d[sel]) * m.candi[sel])) / (sum(abs(d[sel])))
+ } else {
+ d <- abs(d)
+ m <- m.candi[which(d == min(d, na.rm = TRUE))[1]]
}
}
-
- if(do.plot) abline(v=m)
-
- sel1<-which(x=m)
- s1<-sqrt(sum((x[sel1]-m)^2*y[sel1]*dx[sel1])/sum(y[sel1]*dx[sel1]))
- s2<-sqrt(sum((x[sel2]-m)^2*y[sel2]*dx[sel2])/sum(y[sel2]*dx[sel2]))
-
-
- if(power != 1)
- {
- s1<-s1*sqrt(power)
- s2<-s2*sqrt(power)
+
+ if (do.plot) abline(v = m)
+
+ sel1 <- which(x < m)
+ sel2 <- which(x >= m)
+ s1 <- sqrt(sum((x[sel1] - m)^2 * y[sel1] * dx[sel1]) / sum(y[sel1] * dx[sel1]))
+ s2 <- sqrt(sum((x[sel2] - m)^2 * y[sel2] * dx[sel2]) / sum(y[sel2] * dx[sel2]))
+
+
+ if (power != 1) {
+ s1 <- s1 * sqrt(power)
+ s2 <- s2 * sqrt(power)
}
-
- d1<-dnorm(x[sel1], sd=s1, mean=m)
- d2<-dnorm(x[sel2], sd=s2, mean=m)
- d<-c(d1*s1,d2*s2) # notice this "density" doesnt integrate to 1. Rather it integrates to (s1+s2)/2
- y<-y.0
-
- dy.ratio<-d^2*log(y/d)
- dy.ratio[is.na(dy.ratio)]<-0
- dy.ratio[dy.ratio == -Inf]<-0
- dy.ratio[dy.ratio == Inf]<-0
-
-
- scale<-exp(sum(dy.ratio)/sum(d^2))
-
- if(do.plot)
- {
- fit.1<-d*scale
- lines(x[y>0],fit.1,col="red")
+
+ d1 <- dnorm(x[sel1], sd = s1, mean = m)
+ d2 <- dnorm(x[sel2], sd = s2, mean = m)
+ d <- c(d1 * s1, d2 * s2) # notice this "density" doesnt integrate to 1. Rather it integrates to (s1+s2)/2
+ y <- y.0
+
+ dy.ratio <- d^2 * log(y / d)
+ dy.ratio[is.na(dy.ratio)] <- 0
+ dy.ratio[dy.ratio == -Inf] <- 0
+ dy.ratio[dy.ratio == Inf] <- 0
+
+
+ scale <- exp(sum(dy.ratio) / sum(d^2))
+
+ if (do.plot) {
+ fit.1 <- d * scale
+ lines(x[y > 0], fit.1, col = "red")
}
-
- to.return<-c(m,s1,s2,scale)
- if(sum(is.na(to.return)) > 0)
- {
- m<-sum(x*y)/sum(y)
- s1<-s2<-sum(y*(x-m)^2)/sum(y)
- scale<-sum(y)/s1
- to.return<-c(m,s1,s2,scale)
+
+ to.return <- c(m, s1, s2, scale)
+ if (sum(is.na(to.return)) > 0) {
+ m <- sum(x * y) / sum(y)
+ s1 <- s2 <- sum(y * (x - m)^2) / sum(y)
+ scale <- sum(y) / s1
+ to.return <- c(m, s1, s2, scale)
}
}
return(to.return)
}
-
+
##############
- bigauss.mix<-function(x,y,power=1, do.plot=FALSE, sigma.ratio.lim=c(0.1, 10), bw=c(15,30,60), eliminate=.05, max.iter=25, estim.method, BIC.factor=2)
- {
- all.bw<-bw[order(bw)]
-
- x.0<-x
- y.0<-y
-
- sel<-y>1e-5
- x<-x[sel]
- y<-y[sel]
- sel<-order(x)
- y<-y[sel]
- x<-x[sel]
- results<-new("list")
- smoother.pk.rec<-smoother.vly.rec<-new("list")
- bic.rec<-all.bw
-
- if(do.plot)
- {
- par(mfrow=c(ceiling(length(all.bw)/2),2))
- par(mar=c(1,1,1,1))
+ bigauss.mix <- function(x, y, power = 1, do.plot = FALSE, sigma.ratio.lim = c(0.1, 10), bw = c(15, 30, 60), eliminate = .05, max.iter = 25, estim.method, BIC.factor = 2) {
+ all.bw <- bw[order(bw)]
+
+ x.0 <- x
+ y.0 <- y
+
+ sel <- y > 1e-5
+ x <- x[sel]
+ y <- y[sel]
+ sel <- order(x)
+ y <- y[sel]
+ x <- x[sel]
+ results <- new("list")
+ smoother.pk.rec <- smoother.vly.rec <- new("list")
+ bic.rec <- all.bw
+
+ if (do.plot) {
+ par(mfrow = c(ceiling(length(all.bw) / 2), 2))
+ par(mar = c(1, 1, 1, 1))
}
-
- last.num.pks<-Inf
-
- for(bw.n in length(all.bw):1)
+
+ last.num.pks <- Inf
+
+ for (bw.n in length(all.bw):1)
{
- bw<-all.bw[bw.n]
- this.smooth<-ksmooth(x.0,y.0, kernel="normal", bandwidth=bw)
- turns<-find.turn.point(this.smooth$y)
- pks<-this.smooth$x[turns$pks]
- vlys<-c(-Inf, this.smooth$x[turns$vlys], Inf)
-
- smoother.pk.rec[[bw.n]]<-pks
- smoother.vly.rec[[bw.n]]<-vlys
- if(length(pks) != last.num.pks)
- {
- last.num.pks<-length(pks)
- l<-length(x)
- dx<-c(x[2]-x[1], (x[3:l]-x[1:(l-2)])/2, x[l]-x[l-1])
- if(l ==2) dx=rep(diff(x),2)
-
+ bw <- all.bw[bw.n]
+ this.smooth <- ksmooth(x.0, y.0, kernel = "normal", bandwidth = bw)
+ turns <- find.turn.point(this.smooth$y)
+ pks <- this.smooth$x[turns$pks]
+ vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf)
+
+ smoother.pk.rec[[bw.n]] <- pks
+ smoother.vly.rec[[bw.n]] <- vlys
+ if (length(pks) != last.num.pks) {
+ last.num.pks <- length(pks)
+ l <- length(x)
+ dx <- c(x[2] - x[1], (x[3:l] - x[1:(l - 2)]) / 2, x[l] - x[l - 1])
+ if (l == 2) dx <- rep(diff(x), 2)
+
# initiation
- m<-s1<-s2<-delta<-pks
- for(i in 1:length(m))
+ m <- s1 <- s2 <- delta <- pks
+ for (i in 1:length(m))
{
- sel.1<-which(x >= max(vlys[vlys < m[i]]) & x < m[i])
- s1[i]<-sqrt(sum((x[sel.1]-m[i])^2 * y[sel.1]*dx[sel.1])/sum(y[sel.1]*dx[sel.1]))
-
- sel.2<-which(x >= m[i] & x < min(vlys[vlys > m[i]]))
- s2[i]<-sqrt(sum((x[sel.2]-m[i])^2 * y[sel.2] * dx[sel.2])/sum(y[sel.2]*dx[sel.2]))
-
- delta[i]<-(sum(y[sel.1]*dx[sel.1]) + sum(y[sel.2]*dx[sel.2]))/((sum(dnorm(x[sel.1], mean=m[i], sd=s1[i])) * s1[i] /2)+(sum(dnorm(x[sel.2], mean=m[i], sd=s2[i])) * s2[i] /2))
+ sel.1 <- which(x >= max(vlys[vlys < m[i]]) & x < m[i])
+ s1[i] <- sqrt(sum((x[sel.1] - m[i])^2 * y[sel.1] * dx[sel.1]) / sum(y[sel.1] * dx[sel.1]))
+
+ sel.2 <- which(x >= m[i] & x < min(vlys[vlys > m[i]]))
+ s2[i] <- sqrt(sum((x[sel.2] - m[i])^2 * y[sel.2] * dx[sel.2]) / sum(y[sel.2] * dx[sel.2]))
+
+ delta[i] <- (sum(y[sel.1] * dx[sel.1]) + sum(y[sel.2] * dx[sel.2])) / ((sum(dnorm(x[sel.1], mean = m[i], sd = s1[i])) * s1[i] / 2) + (sum(dnorm(x[sel.2], mean = m[i], sd = s2[i])) * s2[i] / 2))
}
- delta[is.na(delta)]<-1e-10
- s1[is.na(s1)]<-1e-10
- s2[is.na(s2)]<-1e-10
-
-
- fit<-matrix(0,ncol=length(m), nrow=length(x)) # this is the matrix of fitted values
-
- this.change=Inf
- counter=0
-
- while(this.change > 0.1 & counter <= max.iter)
- {
- counter<-counter+1
- old.m<-m
-
+ delta[is.na(delta)] <- 1e-10
+ s1[is.na(s1)] <- 1e-10
+ s2[is.na(s2)] <- 1e-10
+
+
+ fit <- matrix(0, ncol = length(m), nrow = length(x)) # this is the matrix of fitted values
+
+ this.change <- Inf
+ counter <- 0
+
+ while (this.change > 0.1 & counter <= max.iter) {
+ counter <- counter + 1
+ old.m <- m
+
# E step
- cuts<-c(-Inf, m, Inf)
- for(j in 2:length(cuts))
+ cuts <- c(-Inf, m, Inf)
+ for (j in 2:length(cuts))
{
- sel<-which(x >= cuts[j-1] & x < cuts[j])
- use.s1<-which(1:length(m) >= (j-1))
- s.to.use<-s2
- s.to.use[use.s1]<-s1[use.s1]
- for(i in 1:ncol(fit))
+ sel <- which(x >= cuts[j - 1] & x < cuts[j])
+ use.s1 <- which(1:length(m) >= (j - 1))
+ s.to.use <- s2
+ s.to.use[use.s1] <- s1[use.s1]
+ for (i in 1:ncol(fit))
{
- fit[sel,i]<-dnorm(x[sel], mean=m[i], sd = s.to.use[i]) * s.to.use[i] *delta[i]
+ fit[sel, i] <- dnorm(x[sel], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i]
}
}
- fit[is.na(fit)]<-0
- sum.fit<-apply(fit, 1, sum)
-
+ fit[is.na(fit)] <- 0
+ sum.fit <- apply(fit, 1, sum)
+
# Elimination step
- fit<-fit/sum.fit
- fit2<-fit * y
- perc.explained<-apply(fit2,2,sum)/sum(y)
- max.erase<-max(1, round(length(perc.explained)/5))
-
- to.erase<-which(perc.explained <= min(eliminate, perc.explained[order(perc.explained, na.last=FALSE)[max.erase]]))
-
-
- if(length(to.erase) > 0)
- {
- m<-m[-to.erase]
- s1<-s1[-to.erase]
- s2<-s2[-to.erase]
- delta<-delta[-to.erase]
- fit<-fit[,-to.erase]
- if(is.null(ncol(fit))) fit<-matrix(fit, ncol=1)
- sum.fit<-apply(fit, 1, sum)
- fit<-fit/sum.fit
- old.m<-old.m[-to.erase]
+ fit <- fit / sum.fit
+ fit2 <- fit * y
+ perc.explained <- apply(fit2, 2, sum) / sum(y)
+ max.erase <- max(1, round(length(perc.explained) / 5))
+
+ to.erase <- which(perc.explained <= min(eliminate, perc.explained[order(perc.explained, na.last = FALSE)[max.erase]]))
+
+
+ if (length(to.erase) > 0) {
+ m <- m[-to.erase]
+ s1 <- s1[-to.erase]
+ s2 <- s2[-to.erase]
+ delta <- delta[-to.erase]
+ fit <- fit[, -to.erase]
+ if (is.null(ncol(fit))) fit <- matrix(fit, ncol = 1)
+ sum.fit <- apply(fit, 1, sum)
+ fit <- fit / sum.fit
+ old.m <- old.m[-to.erase]
}
-
+
# M step
- for(i in 1:length(m))
+ for (i in 1:length(m))
{
- this.y<-y * fit[,i]
- if(estim.method=="moment")
- {
- this.fit<-bigauss.esti(x,this.y,power=power,do.plot=FALSE, sigma.ratio.lim=sigma.ratio.lim)
- }else{
- this.fit<-bigauss.esti.EM(x,this.y,power=power,do.plot=FALSE, sigma.ratio.lim=sigma.ratio.lim)
+ this.y <- y * fit[, i]
+ if (estim.method == "moment") {
+ this.fit <- bigauss.esti(x, this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim)
+ } else {
+ this.fit <- bigauss.esti.EM(x, this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim)
}
- m[i]<-this.fit[1]
- s1[i]<-this.fit[2]
- s2[i]<-this.fit[3]
- delta[i]<-this.fit[4]
+ m[i] <- this.fit[1]
+ s1[i] <- this.fit[2]
+ s2[i] <- this.fit[3]
+ delta[i] <- this.fit[4]
}
- delta[is.na(delta)]<-0
-
- #amount of change
- this.change<-sum((old.m-m)^2)
+ delta[is.na(delta)] <- 0
+
+ # amount of change
+ this.change <- sum((old.m - m)^2)
}
- cuts<-c(-Inf, m, Inf)
- fit<-fit*0
- for(j in 2:length(cuts))
+ cuts <- c(-Inf, m, Inf)
+ fit <- fit * 0
+ for (j in 2:length(cuts))
{
- sel<-which(x >= cuts[j-1] & x < cuts[j])
- use.s1<-which(1:length(m) >= (j-1))
- s.to.use<-s2
- s.to.use[use.s1]<-s1[use.s1]
- for(i in 1:ncol(fit))
+ sel <- which(x >= cuts[j - 1] & x < cuts[j])
+ use.s1 <- which(1:length(m) >= (j - 1))
+ s.to.use <- s2
+ s.to.use[use.s1] <- s1[use.s1]
+ for (i in 1:ncol(fit))
{
- if(s.to.use[i] != 0)
- {
- fit[sel,i]<-dnorm(x[sel], mean=m[i], sd = s.to.use[i]) * s.to.use[i] *delta[i]
+ if (s.to.use[i] != 0) {
+ fit[sel, i] <- dnorm(x[sel], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i]
}
}
}
-
- if(do.plot)
- {
- plot(x,y,cex=.1,main=paste("bw=",bw))
- sum.fit<-apply(fit, 1, sum)
- lines(x,sum.fit)
- abline(v=m)
- cols<-c("red","green","blue","cyan","brown","black",rep("grey",100))
- for(i in 1:length(m))
+
+ if (do.plot) {
+ plot(x, y, cex = .1, main = paste("bw=", bw))
+ sum.fit <- apply(fit, 1, sum)
+ lines(x, sum.fit)
+ abline(v = m)
+ cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100))
+ for (i in 1:length(m))
{
- lines(x, fit[,i],col=cols[i])
+ lines(x, fit[, i], col = cols[i])
}
}
- area<-delta*(s1+s2)/2
- rss<-sum((y-apply(fit,1,sum))^2)
- l<-length(x)
- bic<-l*log(rss/l)+4*length(m)*log(l)*BIC.factor
- results[[bw.n]]<-cbind(m,s1,s2,delta,area)
- bic.rec[bw.n]<-bic
- }else{
- results[[bw.n]]<-NA
- bic.rec[bw.n]<-Inf
- results[[bw.n]]<-results[[bw.n+1]]
-
+ area <- delta * (s1 + s2) / 2
+ rss <- sum((y - apply(fit, 1, sum))^2)
+ l <- length(x)
+ bic <- l * log(rss / l) + 4 * length(m) * log(l) * BIC.factor
+ results[[bw.n]] <- cbind(m, s1, s2, delta, area)
+ bic.rec[bw.n] <- bic
+ } else {
+ results[[bw.n]] <- NA
+ bic.rec[bw.n] <- Inf
+ results[[bw.n]] <- results[[bw.n + 1]]
}
}
- sel<-which(bic.rec == min(bic.rec, na.rm=TRUE))
- if(length(sel) > 1) sel<-sel[which(all.bw[sel]==max(all.bw[sel]))]
- rec<-new("list")
- rec$param<-results[[sel]]
- rec$smoother.pks<-smoother.pk.rec
- rec$smoother.vlys<-smoother.vly.rec
- rec$all.param<-results
- rec$bic<-bic.rec
+ sel <- which(bic.rec == min(bic.rec, na.rm = TRUE))
+ if (length(sel) > 1) sel <- sel[which(all.bw[sel] == max(all.bw[sel]))]
+ rec <- new("list")
+ rec$param <- results[[sel]]
+ rec$smoother.pks <- smoother.pk.rec
+ rec$smoother.vlys <- smoother.vly.rec
+ rec$all.param <- results
+ rec$bic <- bic.rec
return(rec)
}
-
+
#############
- normix<-function(that.curve, pks, vlys, ignore=0.1, max.iter=50, prob.cut=1e-2)
- {
- x<-that.curve[,1]
- y<-that.curve[,2]
-
- if(length(pks)==1)
- {
- miu<-sum(x*y)/sum(y)
- sigma<-sqrt(sum(y*(x-miu)^2)/sum(y))
- fitted<-dnorm(x, mean=miu, sd=sigma)
- this.sel<-y>0 & fitted/dnorm(miu,mean=miu,sd=sigma)>prob.cut
- sc<-exp(sum(fitted[this.sel]^2*log(y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2)))
- }else{
- pks<-pks[order(pks)]
- vlys<-vlys[order(vlys)]
- l<-length(pks)
- miu<-sigma<-sc<-pks
- w<-matrix(0,nrow=l, ncol=length(x))
-
- for(m in 1:l)
+ normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, prob.cut = 1e-2) {
+ x <- that.curve[, 1]
+ y <- that.curve[, 2]
+
+ if (length(pks) == 1) {
+ miu <- sum(x * y) / sum(y)
+ sigma <- sqrt(sum(y * (x - miu)^2) / sum(y))
+ fitted <- dnorm(x, mean = miu, sd = sigma)
+ this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > prob.cut
+ sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2)))
+ } else {
+ pks <- pks[order(pks)]
+ vlys <- vlys[order(vlys)]
+ l <- length(pks)
+ miu <- sigma <- sc <- pks
+ w <- matrix(0, nrow = l, ncol = length(x))
+
+ for (m in 1:l)
{
- this.low<-max(vlys[vlys<=pks[m]])
- this.high<-min(vlys[vlys>=pks[m]])
- this.x<-x[x>=this.low & x<=this.high]
- this.y<-y[x>=this.low & x<=this.high]
-
- miu[m]<-sum(this.x*this.y)/sum(this.y)
- #if(sum(this.y>0) > 1)
- #{
- sigma[m]<-sqrt(sum(this.y*(this.x-miu[m])^2)/sum(this.y))
- #}else{
+ this.low <- max(vlys[vlys <= pks[m]])
+ this.high <- min(vlys[vlys >= pks[m]])
+ this.x <- x[x >= this.low & x <= this.high]
+ this.y <- y[x >= this.low & x <= this.high]
+
+ miu[m] <- sum(this.x * this.y) / sum(this.y)
+ # if(sum(this.y>0) > 1)
+ # {
+ sigma[m] <- sqrt(sum(this.y * (this.x - miu[m])^2) / sum(this.y))
+ # }else{
# sigma[m]<-mean(diff(this.x))/2
- #}
- fitted<-dnorm(this.x, mean=miu[m], sd=sigma[m])
- this.sel<-this.y>0 & fitted/dnorm(miu[m],mean=miu[m],sd=sigma[m]) > prob.cut
- sc[m]<-exp(sum(fitted[this.sel]^2*log(this.y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2)))
- #sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef
+ # }
+ fitted <- dnorm(this.x, mean = miu[m], sd = sigma[m])
+ this.sel <- this.y > 0 & fitted / dnorm(miu[m], mean = miu[m], sd = sigma[m]) > prob.cut
+ sc[m] <- exp(sum(fitted[this.sel]^2 * log(this.y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2)))
+ # sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef
}
-
- to.erase<-which(is.na(miu) | is.na(sigma) | sigma==0 | is.na(sc))
- if(length(to.erase)>0)
- {
- l<-l-length(to.erase)
- miu<-miu[-to.erase]
- sigma<-sigma[-to.erase]
- sc<-sc[-to.erase]
- w<-w[-to.erase,]
+
+ to.erase <- which(is.na(miu) | is.na(sigma) | sigma == 0 | is.na(sc))
+ if (length(to.erase) > 0) {
+ l <- l - length(to.erase)
+ miu <- miu[-to.erase]
+ sigma <- sigma[-to.erase]
+ sc <- sc[-to.erase]
+ w <- w[-to.erase, ]
}
-
- direc<-1
- diff<-1000
- iter<-0
-
- while(diff>0.05 & iter0 & fitted/dnorm(miu,mean=miu,sd=sigma) > prob.cut
- sc<-exp(sum(fitted[this.sel]^2*log(y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2)))
- #sc<-lm(y[this.sel]~fitted[this.sel]+0)$coef
- break;
+
+ direc <- 1
+ diff <- 1000
+ iter <- 0
+
+ while (diff > 0.05 & iter < max.iter) {
+ iter <- iter + 1
+ if (l == 1) {
+ miu <- sum(x * y) / sum(y)
+ sigma <- sqrt(sum(y * (x - miu)^2) / sum(y))
+ fitted <- dnorm(x, mean = miu, sd = sigma)
+ this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > prob.cut
+ sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2)))
+ # sc<-lm(y[this.sel]~fitted[this.sel]+0)$coef
+ break
}
- miu.0<-miu
- sigma.0<-sigma
- sc.0<-sc
-
- all.w<-y*0
- for(m in 1:l)
+ miu.0 <- miu
+ sigma.0 <- sigma
+ sc.0 <- sc
+
+ all.w <- y * 0
+ for (m in 1:l)
{
- all.w<-all.w+dnorm(x,mean=miu[m],sd=sigma[m])*sc[m]
+ all.w <- all.w + dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m]
}
-
- for(m in 1:l)
+
+ for (m in 1:l)
{
- w[m,]<-dnorm(x,mean=miu[m],sd=sigma[m])*sc[m]/all.w
+ w[m, ] <- dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] / all.w
}
-
- if(sum(is.na(w)) >0) break;
-
- for(m in 1:l)
+
+ if (sum(is.na(w)) > 0) break
+
+ for (m in 1:l)
{
- this.y<-y*w[m,]
- miu[m]<-sum(x*this.y)/sum(this.y)
- sigma[m]<-sqrt(sum(this.y*(x-miu[m])^2)/sum(this.y))
- fitted<-dnorm(x,mean=miu[m],sd=sigma[m])
- this.sel<-this.y>0 & fitted/dnorm(miu[m],mean=miu[m],sd=sigma[m])> prob.cut
- sc[m]<-exp(sum(fitted[this.sel]^2*log(this.y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2)))
- #sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef
+ this.y <- y * w[m, ]
+ miu[m] <- sum(x * this.y) / sum(this.y)
+ sigma[m] <- sqrt(sum(this.y * (x - miu[m])^2) / sum(this.y))
+ fitted <- dnorm(x, mean = miu[m], sd = sigma[m])
+ this.sel <- this.y > 0 & fitted / dnorm(miu[m], mean = miu[m], sd = sigma[m]) > prob.cut
+ sc[m] <- exp(sum(fitted[this.sel]^2 * log(this.y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2)))
+ # sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef
}
- diff<-sum((miu.0-miu)^2)
-
- www<-w
- for(m in 1:l)
+ diff <- sum((miu.0 - miu)^2)
+
+ www <- w
+ for (m in 1:l)
{
- www[m,]<-www[m,]*y
+ www[m, ] <- www[m, ] * y
}
- www<-apply(www,1,sum)
- www[which(is.na(sc))]<-0
- www<-www/sum(www)
- max.erase<-max(1, round(l/5))
-
- to.erase<-which(www <= min(ignore, www[order(www, na.last=FALSE)[max.erase]]))
-
- if(length(to.erase)>0)
- {
- l<-l-length(to.erase)
- miu<-miu[-to.erase]
- sigma<-sigma[-to.erase]
- sc<-sc[-to.erase]
- w<-w[-to.erase,]
- diff<-1000
+ www <- apply(www, 1, sum)
+ www[which(is.na(sc))] <- 0
+ www <- www / sum(www)
+ max.erase <- max(1, round(l / 5))
+
+ to.erase <- which(www <= min(ignore, www[order(www, na.last = FALSE)[max.erase]]))
+
+ if (length(to.erase) > 0) {
+ l <- l - length(to.erase)
+ miu <- miu[-to.erase]
+ sigma <- sigma[-to.erase]
+ sc <- sc[-to.erase]
+ w <- w[-to.erase, ]
+ diff <- 1000
}
}
}
- l<-length(miu)
- if(l==1)
- {
- rec<-matrix(c(miu, sigma, sc),nrow=1)
- }else{
- rec<-cbind(miu,sigma,sc)
+ l <- length(miu)
+ if (l == 1) {
+ rec <- matrix(c(miu, sigma, sc), nrow = 1)
+ } else {
+ rec <- cbind(miu, sigma, sc)
}
- colnames(rec)<-c("miu","sigma","scale")
+ colnames(rec) <- c("miu", "sigma", "scale")
return(rec)
}
-
+
##########
-
- normix.bic<-function(x,y,power=2, do.plot=FALSE, bw=c(15,30,60), eliminate=.05, max.iter=50, BIC.factor=2)
- {
- all.bw<-bw[order(bw)]
- sel<-y>1e-5
- x<-x[sel]
- y<-y[sel]
- sel<-order(x)
- y<-y[sel]
- x<-x[sel]
- results<-new("list")
- smoother.pk.rec<-smoother.vly.rec<-new("list")
- bic.rec<-all.bw
-
- if(do.plot)
- {
- par(mfrow=c(ceiling(length(all.bw)/2),2))
- par(mar=c(1,1,1,1))
+
+ normix.bic <- function(x, y, power = 2, do.plot = FALSE, bw = c(15, 30, 60), eliminate = .05, max.iter = 50, BIC.factor = 2) {
+ all.bw <- bw[order(bw)]
+ sel <- y > 1e-5
+ x <- x[sel]
+ y <- y[sel]
+ sel <- order(x)
+ y <- y[sel]
+ x <- x[sel]
+ results <- new("list")
+ smoother.pk.rec <- smoother.vly.rec <- new("list")
+ bic.rec <- all.bw
+
+ if (do.plot) {
+ par(mfrow = c(ceiling(length(all.bw) / 2), 2))
+ par(mar = c(1, 1, 1, 1))
}
-
- last.num.pks<-Inf
-
- for(bw.n in length(all.bw):1)
+
+ last.num.pks <- Inf
+
+ for (bw.n in length(all.bw):1)
{
- bw<-all.bw[bw.n]
- this.smooth<-ksmooth(x,y, kernel="normal", bandwidth=bw)
- turns<-find.turn.point(this.smooth$y)
- pks<-this.smooth$x[turns$pks]
- vlys<-c(-Inf, this.smooth$x[turns$vlys], Inf)
-
- smoother.pk.rec[[bw.n]]<-pks
- smoother.vly.rec[[bw.n]]<-vlys
- if(length(pks) != last.num.pks)
- {
- last.num.pks<-length(pks)
- aaa<-normix(cbind(x,y), pks=pks, vlys=vlys, ignore=eliminate, max.iter=max.iter)
-
- total.fit<-x*0
- for(i in 1:nrow(aaa))
+ bw <- all.bw[bw.n]
+ this.smooth <- ksmooth(x, y, kernel = "normal", bandwidth = bw)
+ turns <- find.turn.point(this.smooth$y)
+ pks <- this.smooth$x[turns$pks]
+ vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf)
+
+ smoother.pk.rec[[bw.n]] <- pks
+ smoother.vly.rec[[bw.n]] <- vlys
+ if (length(pks) != last.num.pks) {
+ last.num.pks <- length(pks)
+ aaa <- normix(cbind(x, y), pks = pks, vlys = vlys, ignore = eliminate, max.iter = max.iter)
+
+ total.fit <- x * 0
+ for (i in 1:nrow(aaa))
{
- total.fit<-total.fit+dnorm(x,mean=aaa[i,1], sd=aaa[i,2])*aaa[i,3]
+ total.fit <- total.fit + dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3]
}
-
- if(do.plot)
- {
- plot(x,y,cex=.1,main=paste("bw=",bw))
- abline(v=aaa[,1])
- cols<-c("red","green","blue","cyan","brown","black",rep("grey",100))
- for(i in 1:nrow(aaa))
+
+ if (do.plot) {
+ plot(x, y, cex = .1, main = paste("bw=", bw))
+ abline(v = aaa[, 1])
+ cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100))
+ for (i in 1:nrow(aaa))
{
- lines(x, dnorm(x,mean=aaa[i,1], sd=aaa[i,2])*aaa[i,3],col=cols[i])
+ lines(x, dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3], col = cols[i])
}
}
-
- rss<-sum((y-total.fit)^2)
- l<-length(x)
- bic<-l*log(rss/l)+3*nrow(aaa)*log(l)*BIC.factor
- results[[bw.n]]<-aaa
- bic.rec[bw.n]<-bic
- }else{
- bic.rec[bw.n]<-Inf
- results[[bw.n]]<-results[[bw.n+1]]
-
+
+ rss <- sum((y - total.fit)^2)
+ l <- length(x)
+ bic <- l * log(rss / l) + 3 * nrow(aaa) * log(l) * BIC.factor
+ results[[bw.n]] <- aaa
+ bic.rec[bw.n] <- bic
+ } else {
+ bic.rec[bw.n] <- Inf
+ results[[bw.n]] <- results[[bw.n + 1]]
}
}
- sel<-which(bic.rec == min(bic.rec))
- if(length(sel) > 1) sel<-sel[which(all.bw[sel]==max(all.bw[sel]))]
- rec<-new("list")
- rec$param<-results[[sel]]
- rec$smoother.pks<-smoother.pk.rec
- rec$smoother.vlys<-smoother.vly.rec
- rec$all.param<-results
- rec$bic<-bic.rec
+ sel <- which(bic.rec == min(bic.rec))
+ if (length(sel) > 1) sel <- sel[which(all.bw[sel] == max(all.bw[sel]))]
+ rec <- new("list")
+ rec$param <- results[[sel]]
+ rec$smoother.pks <- smoother.pk.rec
+ rec$smoother.vlys <- smoother.vly.rec
+ rec$all.param <- results
+ rec$bic <- bic.rec
return(rec)
}
-
+
##########
-
- if(is.na(min.bw)) min.bw<-diff(range(a[,2], na.rm=TRUE))/60
- if(is.na(max.bw)) max.bw<-diff(range(a[,2], na.rm=TRUE))/15
- if(min.bw >= max.bw) min.bw<-max.bw/4
-
- base.curve<-unique(a[,2])
- all.span<-range(base.curve)
- base.curve<-base.curve[order(base.curve)]
- base.curve<-cbind(base.curve, base.curve*0)
- all.times<-base.curve[,1]
- if(all.times[1]>0) all.times<-c(0,all.times)
- all.times<-c(all.times, 2*all.times[length(all.times)]-all.times[length(all.times)-1])
- all.times<-(all.times[1:(length(all.times)-1)]+all.times[2:length(all.times)])/2
- all.times<-all.times[2:length(all.times)]-all.times[1:(length(all.times)-1)]
-
- this.features<-matrix(0,nrow=1,ncol=5)
- colnames(this.features)<-c("mz","pos","sd1","sd2","area")
- n=1
- start=1
- nrowa<-nrow(a)
-
- a.breaks<-c(0, which(a[1:(nrowa-1),4] != a[2:nrowa,4]), nrowa)
- mz.sd.rec<-NA
-
- for(nnn in 1:(length(a.breaks)-1))
+
+ if (is.na(min.bw)) min.bw <- diff(range(a[, 2], na.rm = TRUE)) / 60
+ if (is.na(max.bw)) max.bw <- diff(range(a[, 2], na.rm = TRUE)) / 15
+ if (min.bw >= max.bw) min.bw <- max.bw / 4
+
+ base.curve <- unique(a[, 2])
+ all.span <- range(base.curve)
+ base.curve <- base.curve[order(base.curve)]
+ base.curve <- cbind(base.curve, base.curve * 0)
+ all.times <- base.curve[, 1]
+ if (all.times[1] > 0) all.times <- c(0, all.times)
+ all.times <- c(all.times, 2 * all.times[length(all.times)] - all.times[length(all.times) - 1])
+ all.times <- (all.times[1:(length(all.times) - 1)] + all.times[2:length(all.times)]) / 2
+ all.times <- all.times[2:length(all.times)] - all.times[1:(length(all.times) - 1)]
+
+ this.features <- matrix(0, nrow = 1, ncol = 5)
+ colnames(this.features) <- c("mz", "pos", "sd1", "sd2", "area")
+ n <- 1
+ start <- 1
+ nrowa <- nrow(a)
+
+ a.breaks <- c(0, which(a[1:(nrowa - 1), 4] != a[2:nrowa, 4]), nrowa)
+ mz.sd.rec <- NA
+
+ for (nnn in 1:(length(a.breaks) - 1))
{
- this<-a[(a.breaks[nnn]+1):a.breaks[nnn+1],]
- if(is.null(nrow(this))) this<-matrix(this, nrow=1)
- this<-this[order(this[,2]),]
- if(is.null(nrow(this))) this<-matrix(this, nrow=1)
- mz.sd.rec<-c(mz.sd.rec, sd(this[,1]))
-
- this.count.1<-nrow(this)
- if(this.count.1<=10)
- {
- if(this.count.1>1)
- {
- this.inte<-interpol.area(this[,2], this[,3], base.curve[,1], all.times)
- xxx<-c(median(this[,1]),median(this[,2]), sd(this[,2]), sd(this[,2]), this.inte)
- }else{
- this.time.weights<-all.times[which(base.curve[,1] %in% this[2])]
- xxx<-c(this[1], this[2], NA, NA, this[3]*this.time.weights)
+ this <- a[(a.breaks[nnn] + 1):a.breaks[nnn + 1], ]
+ if (is.null(nrow(this))) this <- matrix(this, nrow = 1)
+ this <- this[order(this[, 2]), ]
+ if (is.null(nrow(this))) this <- matrix(this, nrow = 1)
+ mz.sd.rec <- c(mz.sd.rec, sd(this[, 1]))
+
+ this.count.1 <- nrow(this)
+ if (this.count.1 <= 10) {
+ if (this.count.1 > 1) {
+ this.inte <- interpol.area(this[, 2], this[, 3], base.curve[, 1], all.times)
+ xxx <- c(median(this[, 1]), median(this[, 2]), sd(this[, 2]), sd(this[, 2]), this.inte)
+ } else {
+ this.time.weights <- all.times[which(base.curve[, 1] %in% this[2])]
+ xxx <- c(this[1], this[2], NA, NA, this[3] * this.time.weights)
}
- this.features<-rbind(this.features,xxx)
- }else{
- this.span<-range(this[,2])
- this.curve<-base.curve[base.curve[,1]>=this.span[1] & base.curve[,1] <=this.span[2],]
- this.curve[this.curve[,1] %in% this[,2],2]<-this[,3]
-
- bw<-min(max(bandwidth*(this.span[2]-this.span[1]),min.bw), max.bw)
- bw<-seq(bw, 2*bw, length.out=3)
- if(bw[1] > 1.5*min.bw) bw<-c(max(min.bw, bw[1]/2), bw)
-
- if(shape.model == "Gaussian")
- {
- xxx<-normix.bic(this.curve[,1], this.curve[,2], power=power, bw=bw, eliminate=component.eliminate, BIC.factor=BIC.factor)$param
- if(nrow(xxx) == 1)
- {
- xxx<-c(xxx[1,1:2],xxx[1,2],xxx[1,3])
- }else{
- xxx<-cbind(xxx[,1:2],xxx[,2],xxx[,3])
+ this.features <- rbind(this.features, xxx)
+ } else {
+ this.span <- range(this[, 2])
+ this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ]
+ this.curve[this.curve[, 1] %in% this[, 2], 2] <- this[, 3]
+
+ bw <- min(max(bandwidth * (this.span[2] - this.span[1]), min.bw), max.bw)
+ bw <- seq(bw, 2 * bw, length.out = 3)
+ if (bw[1] > 1.5 * min.bw) bw <- c(max(min.bw, bw[1] / 2), bw)
+
+ if (shape.model == "Gaussian") {
+ xxx <- normix.bic(this.curve[, 1], this.curve[, 2], power = power, bw = bw, eliminate = component.eliminate, BIC.factor = BIC.factor)$param
+ if (nrow(xxx) == 1) {
+ xxx <- c(xxx[1, 1:2], xxx[1, 2], xxx[1, 3])
+ } else {
+ xxx <- cbind(xxx[, 1:2], xxx[, 2], xxx[, 3])
}
- }else{
- xxx<-bigauss.mix(this.curve[,1], this.curve[,2], sigma.ratio.lim=sigma.ratio.lim, bw=bw, power=power, estim.method=estim.method, eliminate=component.eliminate, BIC.factor=BIC.factor)$param[,c(1,2,3,5)]
+ } else {
+ xxx <- bigauss.mix(this.curve[, 1], this.curve[, 2], sigma.ratio.lim = sigma.ratio.lim, bw = bw, power = power, estim.method = estim.method, eliminate = component.eliminate, BIC.factor = BIC.factor)$param[, c(1, 2, 3, 5)]
}
-
- if(is.null(nrow(xxx)))
- {
- this.features<-rbind(this.features,c(median(this[,1]),xxx))
- }else{
- for(m in 1:nrow(xxx))
+
+ if (is.null(nrow(xxx))) {
+ this.features <- rbind(this.features, c(median(this[, 1]), xxx))
+ } else {
+ for (m in 1:nrow(xxx))
{
- this.d<-abs(this[,2]-xxx[m,1])
- this.features<-rbind(this.features, c(mean(this[which(this.d==min(this.d)),1]),xxx[m,]))
+ this.d <- abs(this[, 2] - xxx[m, 1])
+ this.features <- rbind(this.features, c(mean(this[which(this.d == min(this.d)), 1]), xxx[m, ]))
}
}
}
-
- #message(nnn)
+
+ # message(nnn)
}
- this.features<-this.features[-1,]
- this.features<-this.features[order(this.features[,1], this.features[,2]),]
- this.features<-this.features[which(apply(this.features[,3:4],1,min)>sd.cut[1] & apply(this.features[,3:4],1,max) sd.cut[1] & apply(this.features[, 3:4], 1, max) < sd.cut[2]), ]
+ rownames(this.features) <- NULL
+
+ if (do.plot) {
+ par(mfrow = c(2, 2))
+ plot(c(-1, 1), c(-1, 1), type = "n", xlab = "", ylab = "", main = "", axes = FALSE)
+ text(x = 0, y = 0, "Estimate peak \n area/location", cex = 1.5)
+ hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency", main = "m/z SD distribution")
+ hist(c(this.features[, 3], this.features[, 4]), xlab = "Retention time SD", ylab = "Frequency", main = "Retention time SD distribution")
+ hist(log10(this.features[, 5]), xlab = "peak strength (log scale)", ylab = "Frequency", main = "Peak strength distribution")
}
-
+
return(this.features)
}
diff --git a/R/unsupervised.R b/R/unsupervised.R
index 2de6b8b3..61b2aabf 100644
--- a/R/unsupervised.R
+++ b/R/unsupervised.R
@@ -48,64 +48,6 @@ sort_samples_by_acquisition_number <- function (filenames) {
sort(unlist(filenames))
}
-extract_features <- function(
- cluster,
- filenames,
- min_pres,
- min_run,
- mz_tol,
- baseline_correct,
- baseline_correct_noise_percentile,
- intensity_weighted,
- min_bandwidth,
- max_bandwidth,
- sd_cut,
- sigma_ratio_lim,
- shape_model,
- peak_estim_method,
- component_eliminate,
- moment_power,
- BIC_factor
-) {
- clusterExport(cluster, list(
- 'proc.cdf',
- 'prof.to.features',
- 'load.lcms',
- 'adaptive.bin',
- 'find.turn.point',
- 'combine.seq.3',
- 'cont.index',
- 'interpol.area'
- ))
-
- parLapply(cluster, filenames, function(filename) {
- profile <- proc.cdf(
- filename = filename,
- min.pres = min_pres,
- min.run = min_run,
- tol = mz_tol,
- baseline.correct = baseline_correct,
- baseline.correct.noise.percentile = baseline_correct_noise_percentile,
- intensity.weighted = intensity_weighted,
- do.plot = FALSE,
- cache = FALSE
- )
- features <- prof.to.features(
- a = profile,
- min.bw = min_bandwidth,
- max.bw = max_bandwidth,
- sd.cut = sd_cut,
- sigma.ratio.lim = sigma_ratio_lim,
- shape.model = shape_model,
- estim.method = peak_estim_method,
- component.eliminate = component_eliminate,
- power = moment_power,
- BIC.factor = BIC_factor,
- do.plot = FALSE
- )
- })
-}
-
align_features <- function(sample_names, ...) {
aligned <- feature.align(...)
feature_names <- seq_len(nrow(aligned$pk.times))
diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml
index 663bf593..7b7b3b83 100644
--- a/conda/environment-dev.yaml
+++ b/conda/environment-dev.yaml
@@ -26,4 +26,9 @@ dependencies:
- r-tidyr
- r-stringr
- r-datacomparer
- - r-tibble
\ No newline at end of file
+ - r-tibble
+ - r-languageserver
+ - r-jsonlite
+ - r-styler
+ - r-datacomparer
+ - radian
diff --git a/tests/testdata/extracted_features/extracted_0.parquet b/tests/testdata/extracted_features/extracted_0.parquet
new file mode 100644
index 00000000..7a8661b8
Binary files /dev/null and b/tests/testdata/extracted_features/extracted_0.parquet differ
diff --git a/tests/testdata/extracted_features/extracted_1.parquet b/tests/testdata/extracted_features/extracted_1.parquet
new file mode 100644
index 00000000..7a8661b8
Binary files /dev/null and b/tests/testdata/extracted_features/extracted_1.parquet differ
diff --git a/tests/testdata/extracted_features/extracted_2.parquet b/tests/testdata/extracted_features/extracted_2.parquet
new file mode 100644
index 00000000..7a8661b8
Binary files /dev/null and b/tests/testdata/extracted_features/extracted_2.parquet differ
diff --git a/tests/testdata/proc_cdf_expected.Rds b/tests/testdata/proc_cdf_expected.Rds
new file mode 100644
index 00000000..b64a84d8
Binary files /dev/null and b/tests/testdata/proc_cdf_expected.Rds differ
diff --git a/tests/testdata/prof_to_features_expected.Rds b/tests/testdata/prof_to_features_expected.Rds
new file mode 100644
index 00000000..97791b3b
Binary files /dev/null and b/tests/testdata/prof_to_features_expected.Rds differ
diff --git a/tests/testthat/test-extract_features.R b/tests/testthat/test-extract_features.R
new file mode 100644
index 00000000..051d1f44
--- /dev/null
+++ b/tests/testthat/test-extract_features.R
@@ -0,0 +1,33 @@
+test_that("extract single feature works", {
+ filename <- c("../testdata/mbr_test0.mzml")
+
+ extracted_features <- proc.cdf(filename, cache = FALSE)
+ actual <- prof.to.features(extracted_features, do.plot = FALSE)
+
+ expected <- as.data.frame(arrow::read_parquet("../testdata/extracted_features/extracted_0.parquet"))
+ actual <- as.data.frame(actual)
+
+ actual <- unique(actual)
+ expected <- unique(expected)
+
+ keys <- c("mz", "pos", "sd1", "sd2")
+
+ actual <- dplyr::arrange_at(
+ actual, keys
+ )
+ expected <- dplyr::arrange_at(
+ expected, keys
+ )
+
+ report <- dataCompareR::rCompare(actual, expected, keys = keys, roundDigits = 4, mismatches = 100000)
+ dataCompareR::saveReport(report, reportName = "extract_features_0", showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000)
+
+ expect_true(nrow(report$rowMatching$inboth) >= 0.9 * nrow(expected))
+
+ incommon <- as.numeric(rownames(report$rowMatching$inboth))
+
+ subset_actual <- actual %>% dplyr::slice(incommon)
+ subset_expected <- expected %>% dplyr::slice(incommon)
+
+ expect_equal(subset_actual$area, subset_expected$area, tolerance = 0.01 * max(expected$area))
+})
diff --git a/tests/testthat/test-proc.cdf.R b/tests/testthat/test-proc.cdf.R
new file mode 100644
index 00000000..5d987406
--- /dev/null
+++ b/tests/testthat/test-proc.cdf.R
@@ -0,0 +1,6 @@
+test_that("test proc.cdf test", {
+ filename <- c('../testdata/mbr_test0.mzml')
+ actual <- proc.cdf(filename, cache = FALSE)
+ expected <- readRDS('../testdata/proc_cdf_expected.Rds')
+ expect_equal(actual, expected)
+})
\ No newline at end of file
diff --git a/tests/testthat/test-prof.to.features.R b/tests/testthat/test-prof.to.features.R
new file mode 100644
index 00000000..5b43dcdf
--- /dev/null
+++ b/tests/testthat/test-prof.to.features.R
@@ -0,0 +1,11 @@
+test_that("prof.to.features works", {
+ extracted_features <- readRDS('../testdata/proc_cdf_expected.Rds')
+ actual <- prof.to.features(
+ extracted_features,
+ sd.cut = c(0.1, 100),
+ sigma.ratio.lim = c(0.1, 10),
+ do.plot = FALSE)
+
+ expected <- readRDS("../testdata/prof_to_features_expected.Rds")
+ expect_equal(actual, expected)
+})
\ No newline at end of file