diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 55069af9..3a49acbf 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -12,19 +12,29 @@ "features": { "git": { "version": "latest", - "ppa": false + "ppa": false, }, "common": { "installZsh": "false", "username": "false", - } }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ikuyadeu.r"], + "extensions": [ + "reditorsupport.r", + "rdebugger.r-debugger", + "eamodio.gitlens", + "mutantdino.resourcemonitor", + "meakbiyik.vscode-r-test-adapter", + "dvirtz.parquet-viewer" + ], + "settings": { + "r.rterm.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/radian", + "r.rpath.linux": "/bin/local/miniconda/envs/recetox-aplcms/bin/R" + }, - "onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8", + "onCreateCommand": "apt update && apt install -y locales && locale-gen en_US.UTF-8 && git config --global --add safe.directory /workspaces/recetox-aplcms", "postAttachCommand": "/bin/bash" } diff --git a/DESCRIPTION b/DESCRIPTION index 0e9a580b..2c3a32a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,3 +17,4 @@ NeedsCompilation: no Suggests: arrow, dataCompareR, testthat (>= 3.0.0) Config/testthat/edition: 3 +RoxygenNote: 7.2.0 diff --git a/R/adaptive.bin.R b/R/adaptive.bin.R index c87e6755..f70eefd4 100644 --- a/R/adaptive.bin.R +++ b/R/adaptive.bin.R @@ -1,162 +1,191 @@ -adaptive.bin <- -function(x, min.run, min.pres, tol, baseline.correct, weighted=FALSE) -{ - masses<-x$masses - labels<-x$labels - intensi<-x$intensi - times<-x$times - - rm(x) - - base.curve<-unique(times) - base.curve<-base.curve[order(base.curve)] - base.curve<-cbind(base.curve, base.curve*0) - - curr.order<-order(masses) - intensi<-intensi[curr.order] - labels<-labels[curr.order] - masses<-masses[curr.order] - - rm(curr.order) - - cat(c("m/z tolerance is: ", tol,"\n")) - - l<-length(masses) - - curr.bw<-0.5*tol*max(masses) - if(weighted) - { - all.mass.den<-density(masses, weights=intensi/sum(intensi), bw=curr.bw, n=2^min(15, floor(log2(l))-2)) - }else{ - all.mass.den<-density(masses, bw=curr.bw, n=2^min(15, floor(log2(l))-2)) - } - all.mass.turns<-find.turn.point(all.mass.den$y) - all.mass.vlys<-all.mass.den$x[all.mass.turns$vlys] - breaks<-c(0, unique(round(approx(masses,1:l,xout=all.mass.vlys,rule=2,ties='ordered')$y))[-1]) - - grps<-masses*0 # this is which peak group the signal belongs to, not time group - - min.lab<-min(labels) - max.lab<-max(labels) - times<-unique(labels) - times<-times[order(times)] - curr.label<-1 - min.count.run<-min.run*length(times)/(max(times)-min(times)) - time.range<-diff(range(times)) - aver.time.range<-(max(labels)-min(labels))/length(times) - - newprof<-matrix(0, nrow=length(masses),ncol=4) - height.rec<-matrix(0, nrow=length(masses),ncol=3) - prof.pointer<-1 - height.pointer<-1 - - for(i in 1:(length(breaks)-1)) - { - this.labels<-labels[(breaks[i]+1):breaks[i+1]] - if(length(unique(this.labels)) >= min.count.run*min.pres) - { - this.masses<-masses[(breaks[i]+1):breaks[i+1]] - this.intensi<-intensi[(breaks[i]+1):breaks[i+1]] - - curr.order<-order(this.labels) - this.masses<-this.masses[curr.order] - this.intensi<-this.intensi[curr.order] - this.labels<-this.labels[curr.order] - - this.bw=0.5*tol*median(this.masses) - if(weighted) - { - mass.den<-density(this.masses, weights=this.intensi/sum(this.intensi), bw=this.bw) - }else{ - mass.den<-density(this.masses, bw=this.bw) - } - mass.den$y[mass.den$y 
< min(this.intensi)/10]<-0 - mass.turns<-find.turn.point(mass.den$y) - mass.pks<-mass.den$x[mass.turns$pks] - mass.vlys<-c(-Inf, mass.den$x[mass.turns$vlys], Inf) - - - for(j in 1:length(mass.pks)) - { - mass.lower<-max(mass.vlys[mass.vlys < mass.pks[j]]) - mass.upper<-min(mass.vlys[mass.vlys > mass.pks[j]]) - - if(length(mass.pks) == 1) mass.lower<-mass.lower-1 - mass.sel<-which(this.masses > mass.lower & this.masses <= mass.upper) - - if(length(mass.sel) > 0) - { - that.labels<-this.labels[mass.sel] - that.masses<-this.masses[mass.sel] - that.intensi<-this.intensi[mass.sel] - - that.merged<-combine.seq.3(that.labels, that.masses, that.intensi) - if(nrow(that.merged)==1) - { - new.merged<-that.merged - }else{ - new.merged<-that.merged[order(that.merged[,1]),] - } - - that.labels<-new.merged[,2] - that.masses<-new.merged[,1] - that.intensi<-new.merged[,3] - that.range<-diff(range(that.labels)) - - if(that.range > 0.5*time.range & length(that.labels) > that.range*min.pres & length(that.labels)/(diff(range(that.labels))/aver.time.range) > min.pres) - { - that.intensi<-rm.ridge(that.labels, that.intensi, bw=max(10*min.run, that.range/2)) - - that.sel<-which(that.intensi != 0) - that.labels<-that.labels[that.sel] - that.masses<-that.masses[that.sel] - that.intensi<-that.intensi[that.sel] - } - that.n<-length(that.masses) - newprof[prof.pointer:(prof.pointer+that.n-1),]<-cbind(that.masses, that.labels, that.intensi, rep(curr.label, that.n)) - prof.pointer<-prof.pointer+that.n - height.rec[height.pointer,]<-c(curr.label,that.n, max(that.intensi)) - height.pointer<-height.pointer+1 - curr.label<-curr.label+1 - } - } - }else{ - if(runif(1)<0.05) - { - this.masses<-masses[(breaks[i]+1):breaks[i+1]] - this.intensi<-intensi[(breaks[i]+1):breaks[i+1]] - curr.order<-order(this.labels) - this.masses<-this.masses[curr.order] - this.intensi<-this.intensi[curr.order] - this.labels<-this.labels[curr.order] - - - that.merged<-combine.seq.3(this.labels, this.masses, this.intensi) - that.n<-nrow(that.merged) - newprof[prof.pointer:(prof.pointer+that.n-1),]<-cbind(that.merged, rep(curr.label, that.n)) - prof.pointer<-prof.pointer+that.n - - height.rec[height.pointer,]<-c(curr.label,that.n, max(that.merged[,3])) - height.pointer<-height.pointer+1 - curr.label<-curr.label+1 - } +compute_densities <- function(masses, tol, weighted, intensities, bw_func, n = 512) { + bandwidth <- 0.5 * tol * bw_func(masses) + if (weighted) { + weights <- intensities / sum(intensities) + all.mass.den <- density(masses, weights = weights, bw = bandwidth, n = n) + } else { + all.mass.den <- density(masses, bw = bandwidth, n = n) + } + return(all.mass.den) +} + +compute_mass_values <- function(tol, masses, intensi, weighted) { + n <- 2^min(15, floor(log2(length(masses))) - 2) + + all.mass.den <- compute_densities(masses, tol, weighted, intensi, max, n) + + all.mass.turns <- find.turn.point(all.mass.den$y) + all.mass.vlys <- all.mass.den$x[all.mass.turns$vlys] + return(all.mass.vlys) +} + +compute_breaks <- function(tol, masses, intensi, weighted) { + all.mass.vlys <- compute_mass_values(tol, masses, intensi, weighted) + breaks <- c(0, unique(round(approx(masses, 1:length(masses), xout = all.mass.vlys, rule = 2, ties = "ordered")$y))[-1]) + return(breaks) +} + +adaptive.bin <- function(x, + min.run, + min.pres, + tol, + baseline.correct, + weighted = FALSE) { + # order inputs after mz values + masses <- x$masses + labels <- x$labels + intensi <- x$intensi + times <- x$times + + rm(x) + + curr.order <- order(masses) + intensi <- 
intensi[curr.order] + labels <- labels[curr.order] + masses <- masses[curr.order] + + rm(curr.order) + + cat(c("m/z tolerance is: ", tol, "\n")) + + times <- unique(labels) + times <- times[order(times)] + + # calculate function parameters + min.count.run <- min.run * length(times) / (max(times) - min(times)) + time.range <- diff(range(times)) + aver.time.range <- (max(labels) - min(labels)) / length(times) + + # init data + newprof <- matrix(0, nrow = length(masses), ncol = 4) + height.rec <- matrix(0, nrow = length(masses), ncol = 3) + + # init counters + curr.label <- 1 + prof.pointer <- 1 + height.pointer <- 1 + + breaks <- compute_breaks(tol, masses, intensi, weighted) + + for (i in 1:(length(breaks) - 1)) + { + start <- breaks[i] + 1 + end <- breaks[i + 1] + # get number of scans in bin + this.labels <- labels[start: end] + + if (length(unique(this.labels)) >= min.count.run * min.pres) { + # extract mz and intensity values for bin + this.masses <- masses[start:end] + this.intensi <- intensi[start:end] + + # reorder in order of labels (scan number) + curr.order <- order(this.labels) + this.masses <- this.masses[curr.order] + this.intensi <- this.intensi[curr.order] + this.labels <- this.labels[curr.order] + + mass.den <- compute_densities(this.masses, tol, weighted, this.intensi, median) + + mass.den$y[mass.den$y < min(this.intensi) / 10] <- 0 + mass.turns <- find.turn.point(mass.den$y) + mass.pks <- mass.den$x[mass.turns$pks] + mass.vlys <- c(-Inf, mass.den$x[mass.turns$vlys], Inf) + + + for (j in 1:length(mass.pks)) + { + # compute boundaries + mass.lower <- max(mass.vlys[mass.vlys < mass.pks[j]]) + mass.upper <- min(mass.vlys[mass.vlys > mass.pks[j]]) + + if (length(mass.pks) == 1) mass.lower <- mass.lower - 1 + + # compute if we are in mass range from mass.lower to mass.upper + mass.sel <- which(this.masses > mass.lower & this.masses <= mass.upper) + + if (length(mass.sel) > 0) { + + # get rows which fulfill condition + that.labels <- this.labels[mass.sel] + that.masses <- this.masses[mass.sel] + that.intensi <- this.intensi[mass.sel] + + # rearrange in order of labels + that.merged <- combine.seq.3(that.labels, that.masses, that.intensi) + if (nrow(that.merged) == 1) { + new.merged <- that.merged + } else { + new.merged <- that.merged[order(that.merged[, 1]), ] + } + + that.labels <- new.merged[, 2] + that.masses <- new.merged[, 1] + that.intensi <- new.merged[, 3] + that.range <- diff(range(that.labels)) + + if (that.range > 0.5 * time.range & length(that.labels) > that.range * min.pres & length(that.labels) / (diff(range(that.labels)) / aver.time.range) > min.pres) { + that.intensi <- rm.ridge(that.labels, that.intensi, bw = max(10 * min.run, that.range / 2)) + + # filter out 0 entries + that.sel <- which(that.intensi != 0) + that.labels <- that.labels[that.sel] + that.masses <- that.masses[that.sel] + that.intensi <- that.intensi[that.sel] + } + + that.n <- length(that.masses) + + newprof[prof.pointer:(prof.pointer + that.n - 1), ] <- cbind(that.masses, that.labels, that.intensi, rep(curr.label, that.n)) + height.rec[height.pointer, ] <- c(curr.label, that.n, max(that.intensi)) + + # increment counters + prof.pointer <- prof.pointer + that.n + height.pointer <- height.pointer + 1 + curr.label <- curr.label + 1 } + } + } else { + if (runif(1) < 0.05) { + + # reassignment + this.masses <- masses[start:end] + this.intensi <- intensi[start:end] + + # reordering + curr.order <- order(this.labels) + this.masses <- this.masses[curr.order] + this.intensi <- this.intensi[curr.order] + 
this.labels <- this.labels[curr.order] + + that.merged <- combine.seq.3(this.labels, this.masses, this.intensi) + that.n <- nrow(that.merged) + + newprof[prof.pointer:(prof.pointer + that.n - 1), ] <- cbind(that.merged, rep(curr.label, that.n)) + height.rec[height.pointer, ] <- c(curr.label, that.n, max(that.merged[, 3])) + + # increment counters + prof.pointer <- prof.pointer + that.n + height.pointer <- height.pointer + 1 + curr.label <- curr.label + 1 + } } - - newprof<-newprof[1:(prof.pointer-1),] - height.rec<-height.rec[1:(height.pointer-1),] - - newprof<-newprof[order(newprof[,1],newprof[,2]),] - - raw.prof<-new("list") - raw.prof$height.rec<-height.rec - raw.prof$masses<-newprof[,1] - raw.prof$labels<-newprof[,2] - raw.prof$intensi<-newprof[,3] - raw.prof$grps<-newprof[,4] - raw.prof$times<-times - raw.prof$tol<-tol - raw.prof$min.count.run<-min.count.run - - return(raw.prof) + } + + newprof <- newprof[1:(prof.pointer - 1), ] + height.rec <- height.rec[1:(height.pointer - 1), ] + + newprof <- newprof[order(newprof[, 1], newprof[, 2]), ] + + raw.prof <- new("list") + raw.prof$height.rec <- height.rec + raw.prof$masses <- newprof[, 1] + raw.prof$labels <- newprof[, 2] + raw.prof$intensi <- newprof[, 3] + raw.prof$grps <- newprof[, 4] + raw.prof$times <- times + raw.prof$tol <- tol + raw.prof$min.count.run <- min.count.run + + return(raw.prof) } diff --git a/R/extract_features.R b/R/extract_features.R new file mode 100644 index 00000000..eb8ea1e2 --- /dev/null +++ b/R/extract_features.R @@ -0,0 +1,63 @@ +extract_features <- function( + cluster, + filenames, + min_pres, + min_run, + mz_tol, + baseline_correct, + baseline_correct_noise_percentile, + intensity_weighted, + min_bandwidth, + max_bandwidth, + sd_cut, + sigma_ratio_lim, + shape_model, + peak_estim_method, + component_eliminate, + moment_power, + BIC_factor +) { + clusterExport(cluster, list( + 'proc.cdf', + 'prof.to.features', + 'load.lcms', + 'adaptive.bin', + 'find.turn.point', + 'combine.seq.3', + 'cont.index', + 'interpol.area', + 'load_file', + 'load_data', + 'plot_raw_profile_histogram', + 'compute_mass_values', + 'compute_densities', + 'compute_breaks' + )) + + parLapply(cluster, filenames, function(filename) { + profile <- proc.cdf( + filename = filename, + min.pres = min_pres, + min.run = min_run, + tol = mz_tol, + baseline.correct = baseline_correct, + baseline.correct.noise.percentile = baseline_correct_noise_percentile, + intensity.weighted = intensity_weighted, + do.plot = FALSE, + cache = FALSE + ) + features <- prof.to.features( + a = profile, + min.bw = min_bandwidth, + max.bw = max_bandwidth, + sd.cut = sd_cut, + sigma.ratio.lim = sigma_ratio_lim, + shape.model = shape_model, + estim.method = peak_estim_method, + component.eliminate = component_eliminate, + power = moment_power, + BIC.factor = BIC_factor, + do.plot = FALSE + ) + }) +} diff --git a/R/plot.R b/R/plot.R new file mode 100644 index 00000000..972f35da --- /dev/null +++ b/R/plot.R @@ -0,0 +1,34 @@ +plot_raw_profile_histogram <- function( + raw.prof, + min.pres, + baseline.correct, + baseline.correct.noise.percentile, + tol, + new.prof +) { + h.1 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] <= max(2, raw.prof$min.count.run * min.pres / 2), 3]) + h.2 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres, 3]) + + if (is.na(baseline.correct)) { + baseline.correct <- 10^quantile(h.1, baseline.correct.noise.percentile) + message(c("maximal height cut is automatically set at the", 
baseline.correct.noise.percentile, "percentile of noise group heights: ", baseline.correct)) + } else { + message(c("maximal height cut is provided by user: ", baseline.correct)) + } + par(mfrow = c(2, 2)) + + plot(c(-1, 1), c(-1, 1), type = "n", xlab = "", ylab = "", main = "tolerance level loaded", axes = FALSE) + text(x = 0, y = 0, tol, cex = 1.2) + + if (length(h.1) > 50) { + plot(density(h.1), xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), main = "Black - noise groups \n Blue - selected groups") + } else { + plot(NA, NA, xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), ylim = c(0, 1), main = "Black - noise groups \n Blue - selected groups") + if (length(h.1) > 0) abline(v = h.1) + } + + abline(v = log10(baseline.correct), col = "red") + lines(density(log10(new.prof$height.rec)), col = "blue") + hist(new.prof$time.range.rec, xlab = "Range of retention time in the same group", ylab = "Density", freq = FALSE, nclass = 100, main = "Group retention time range distribution") + hist(new.prof$mz.pres.rec, xlab = "% signal present in the same group", ylab = "Density", freq = FALSE, nclass = 20, main = "Group % present signal distribution") +} \ No newline at end of file diff --git a/R/proc.cdf.R b/R/proc.cdf.R index ce82f8fc..7c97c159 100644 --- a/R/proc.cdf.R +++ b/R/proc.cdf.R @@ -1,59 +1,73 @@ -proc.cdf <- function(filename, min.pres = 0.5, min.run = 12, tol = 1e-5, baseline.correct = 0, baseline.correct.noise.percentile = 0, do.plot = TRUE, intensity.weighted = FALSE, cache = TRUE) { +load_file <- function(filename) { + this <- load.lcms(filename) + + # this could eventually be replaced using drop_na + na.sel <- c(which(is.na(this$masses)), which(is.na(this$labels)), which(is.na(this$intensi))) + if (length(na.sel) > 0) { + na.sel <- unique(na.sel) + this$masses <- this$masses[-na.sel] + this$labels <- this$labels[-na.sel] + this$intensi <- this$intensi[-na.sel] + + warning("there are NA values in the m/z or intensity. Check the file:", filename) + } + # TODO + # this <- tidyr::drop_na(as.data.frame(this)) + return(this) +} + + +load_data <- function(filename, + cache, + min.run, + min.pres, + tol, + baseline.correct, + intensity.weighted) { rawprof_filename <- paste(strsplit(tolower(filename), "\\.")[[1]][1], "_", min.run, "_", min.pres, "_", tol, ".rawprof", sep = "") if (cache && file.exists(rawprof_filename)) { load(rawprof_filename) } else { - this <- load.lcms(filename) - - na.sel <- c(which(is.na(this$masses)), which(is.na(this$labels)), which(is.na(this$intensi))) - if (length(na.sel) > 0) { - na.sel <- unique(na.sel) - this$masses <- this$masses[-na.sel] - this$labels <- this$labels[-na.sel] - this$intensi <- this$intensi[-na.sel] - - warning("there are NA values in the m/z or intensity. 
Check the file:", filename) - } - raw.prof <- adaptive.bin(this, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted) + raw.data <- load_file(filename) + raw.prof <- adaptive.bin(raw.data, min.run = min.run, min.pres = min.pres, tol = tol, baseline.correct = baseline.correct, weighted = intensity.weighted) } if (cache && !file.exists(rawprof_filename)) { save(raw.prof, file = rawprof_filename) } - newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps) - h.1 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] <= max(2, raw.prof$min.count.run * min.pres / 2), 3]) - h.2 <- log10(raw.prof$height.rec[raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres, 3]) + return(raw.prof) +} - if (is.na(baseline.correct)) { - baseline.correct <- 10^quantile(h.1, baseline.correct.noise.percentile) - message(c("maximal height cut is automatically set at the", baseline.correct.noise.percentile, "percentile of noise group heights: ", baseline.correct)) - } else { - message(c("maximal height cut is provided by user: ", baseline.correct)) - } +#' Process file and return the profile. +#' @export +proc.cdf <- function(filename, + min.pres = 0.5, + min.run = 12, + tol = 1e-05, + baseline.correct = 0.0, + baseline.correct.noise.percentile = 0.05, + do.plot = FALSE, + intensity.weighted = FALSE, + cache = FALSE) { + raw.prof <- load_data(filename, cache, min.run, min.pres, tol, baseline.correct, intensity.weighted) + newprof <- cbind(raw.prof$masses, raw.prof$labels, raw.prof$intensi, raw.prof$grps) run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min.pres & raw.prof$height.rec[, 3] > baseline.correct), 1] - newprof <- newprof[newprof[, 4] %in% run.sel,] + + newprof <- newprof[newprof[, 4] %in% run.sel, ] new.prof <- cont.index(newprof, min.pres = min.pres, min.run = min.run) if (do.plot) { - par(mfrow = c(2, 2)) - - plot(c(-1,1), c(-1,1), type = "n", xlab = "", ylab = "", main = "tolerance level loaded", axes = FALSE) - text(x = 0, y = 0, tol, cex = 1.2) - - if (length(h.1) > 50) { - plot(density(h.1), xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), main = "Black - noise groups \n Blue - selected groups") - } else { - plot(NA, NA, xlab = "maximum height of group (log scale)", xlim = range(c(h.1, h.2)), ylim = c(0, 1), main = "Black - noise groups \n Blue - selected groups") - if (length(h.1) > 0) abline(v = h.1) - } - - abline(v = log10(baseline.correct), col = "red") - lines(density(log10(new.prof$height.rec)), col = "blue") - hist(new.prof$time.range.rec, xlab = "Range of retention time in the same group", ylab = "Density", freq = FALSE, nclass = 100, main = "Group retention time range distribution") - hist(new.prof$mz.pres.rec, xlab = "% signal present in the same group", ylab = "Density", freq = FALSE, nclass = 20, main = "Group % present signal distribution") + plot_raw_profile_histogram( + raw.prof, + min.pres, + baseline.correct, + baseline.correct.noise.percentile, + tol, + new.prof + ) } return(new.prof$new.rec) diff --git a/R/prof.to.features.R b/R/prof.to.features.R index 54084d9e..c3c7efac 100644 --- a/R/prof.to.features.R +++ b/R/prof.to.features.R @@ -1,712 +1,683 @@ -prof.to.features <- -function(a, bandwidth=0.5, min.bw=NA, max.bw=NA, sd.cut=c(0.1,100), sigma.ratio.lim=c(0.1, 10), shape.model="bi-Gaussian", estim.method="moment",do.plot=TRUE, power=1, component.eliminate=0.01, BIC.factor=2) -{ - if(sum(shape.model %in% 
c("bi-Gaussian", "Gaussian")) == 0) - { +prof.to.features <- function(a, + bandwidth = 0.5, + min.bw = NA, + max.bw = NA, + sd.cut = c(0.01, 500), + sigma.ratio.lim = c(0.01, 100), + shape.model = "bi-Gaussian", + estim.method = "moment", + do.plot = TRUE, + power = 1, + component.eliminate = 0.01, + BIC.factor = 2) { + if (sum(shape.model %in% c("bi-Gaussian", "Gaussian")) == 0) { print("Error: peak shape model has to be Gaussian or bi-Gaussian") return(0) } - if(sum(estim.method %in% c("moment", "EM")) == 0) - { + if (sum(estim.method %in% c("moment", "EM")) == 0) { print("Error: peak model estimation method has to be moment or EM") return(0) } - + ##################### - bigauss.esti.EM <- function(t, x, max.iter=50, epsilon=0.005, power=1, do.plot=FALSE, truth=NA, sigma.ratio.lim=c(0.3, 1)) - { - + bigauss.esti.EM <- function(t, x, max.iter = 50, epsilon = 0.005, power = 1, do.plot = FALSE, truth = NA, sigma.ratio.lim = c(0.3, 1)) { + ## function takes into x and t, and then computes the value of ## sigma.1, sigma.2 and a using iterative method. the returned ## values include estimated sigmas, a and a boolean variable on ## whether the termination criteria is satified upon the end of the ## program. - - sel<-which(x>1e-10) - if(length(sel)==0) - { - return(c(median(t),1,1,0)) + + sel <- which(x > 1e-10) + if (length(sel) == 0) { + return(c(median(t), 1, 1, 0)) } - if(length(sel)==1) - { - return(c(t[sel], 1,1,0)) + if (length(sel) == 1) { + return(c(t[sel], 1, 1, 0)) } - t<-t[sel] - x<-x[sel] - solve.sigma <- function(x,t,a){ - + t <- t[sel] + x <- x[sel] + solve.sigma <- function(x, t, a) { + ## this function takes the value intensity level x, retention time t ## and assumed breaking point a, calculates the square estimated of ## sigma.1 and sigma.2. - - prep.uv <- function(x,t,a){ - + + prep.uv <- function(x, t, a) { + ## this function prepares the parameters required for latter ## compuation. u, v, and sum of x. - - temp <- (t-a)^2 * x - u <- sum(temp * as.numeric(t=a)) - return(list(u=u, - v=v, - x.sum=sum(x))) + + temp <- (t - a)^2 * x + u <- sum(temp * as.numeric(t < a)) + v <- sum(temp * as.numeric(t >= a)) + return(list( + u = u, + v = v, + x.sum = sum(x) + )) } - tt <- prep.uv(x,t,a) - sigma.1 <- tt$u/tt$x.sum*((tt$v/tt$u)^(1/3)+1) - sigma.2 <- tt$v/tt$x.sum*((tt$u/tt$v)^(1/3)+1) - return(list(sigma.1=sigma.1, - sigma.2=sigma.2)) + tt <- prep.uv(x, t, a) + sigma.1 <- tt$u / tt$x.sum * ((tt$v / tt$u)^(1 / 3) + 1) + sigma.2 <- tt$v / tt$x.sum * ((tt$u / tt$v)^(1 / 3) + 1) + return(list( + sigma.1 = sigma.1, + sigma.2 = sigma.2 + )) } - - solve.a <- function(x,t,a,sigma.1,sigma.2){ - + + solve.a <- function(x, t, a, sigma.1, sigma.2) { + ## thif function solves the value of a using the x, t, a from the ## previous step, and sigma.1, and sigma.2 - - w <- x * (as.numeric(t=a)/sigma.2) - return(sum(t * w)/ sum(w)) + + w <- x * (as.numeric(t < a) / sigma.1 + as.numeric(t >= a) / sigma.2) + return(sum(t * w) / sum(w)) } - + ## epsilon is the threshold for continuing the iteration. change in ## a smaller than epsilon will terminate the iteration. ## epsilon <- min(diff(sort(t)))/2 - + ## using the median value of t as the initial value of a. - a.old <- t[which(x==max(x))[1]] + a.old <- t[which(x == max(x))[1]] a.new <- a.old - change <- 10*epsilon - + change <- 10 * epsilon + ## n.iter is the number of iteration covered so far. 
n.iter <- 0 - - while((change>epsilon) & (n.iter epsilon) & (n.iter < max.iter)) { ## print(c(n.iter,change)) a.old <- a.new - n.iter <- n.iter+1 - sigma <- solve.sigma(x,t,a.old) - if(n.iter == 1) sigma[is.na(sigma)]<-as.numeric(sigma[which(!is.na(sigma))])[1]/10 - a.new <- solve.a(x,t,a.old,sigma$sigma.1,sigma$sigma.2) - change <- abs(a.old-a.new) + n.iter <- n.iter + 1 + sigma <- solve.sigma(x, t, a.old) + if (n.iter == 1) sigma[is.na(sigma)] <- as.numeric(sigma[which(!is.na(sigma))])[1] / 10 + a.new <- solve.a(x, t, a.old, sigma$sigma.1, sigma$sigma.2) + change <- abs(a.old - a.new) } # return(list(a=a.new, # sigma.1=sigma$sigma.1, # sigma.2=sigma$sigma.2, # iter.end=(max.iter>n.iter) # )) - d<-x - sigma$sigma.2<-sqrt(sigma$sigma.2) - sigma$sigma.1<-sqrt(sigma$sigma.1) - - d[t=a.new]<-dnorm(t[t>=a.new],mean=a.new,sd=sigma$sigma.2)*sigma$sigma.2 - scale<-exp(sum(d[d>1e-3]^2*log(x[d>1e-3]/d[d>1e-3]))/sum(d[d>1e-3]^2)) + d <- x + sigma$sigma.2 <- sqrt(sigma$sigma.2) + sigma$sigma.1 <- sqrt(sigma$sigma.1) + + d[t < a.new] <- dnorm(t[t < a.new], mean = a.new, sd = sigma$sigma.1) * sigma$sigma.1 + d[t >= a.new] <- dnorm(t[t >= a.new], mean = a.new, sd = sigma$sigma.2) * sigma$sigma.2 + scale <- exp(sum(d[d > 1e-3]^2 * log(x[d > 1e-3] / d[d > 1e-3])) / sum(d[d > 1e-3]^2)) return(c(a.new, sigma$sigma.1, sigma$sigma.2, scale)) } - - bigauss.esti<-function(x,y,power=1, do.plot=FALSE, truth=NA, sigma.ratio.lim=c(0.3, 3)) - { - sel<-which(y>1e-10) - if(length(sel)<2) - { - to.return<-c(median(x),1,1,0) - }else{ - x<-x[sel] - y<-y[sel] - # sel<-order(x) - # y<-y[sel] - # x<-x[sel] - - y.0<-y - if(do.plot) plot(x,y) - if(do.plot & ! is.na(truth[1])) - { - true.y1<-dnorm(x[x=truth[1]],mean=truth[1],sd=truth[3])*truth[3]*truth[4] - lines(x, c(true.y1,true.y2),col="green") + + bigauss.esti <- function(x, y, power = 1, do.plot = FALSE, truth = NA, sigma.ratio.lim = c(0.3, 3)) { + sel <- which(y > 1e-10) + if (length(sel) < 2) { + to.return <- c(median(x), 1, 1, 0) + } else { + x <- x[sel] + y <- y[sel] + # sel<-order(x) + # y<-y[sel] + # x<-x[sel] + + y.0 <- y + if (do.plot) plot(x, y) + if (do.plot & !is.na(truth[1])) { + true.y1 <- dnorm(x[x < truth[1]], mean = truth[1], sd = truth[2]) * truth[2] * truth[4] + true.y2 <- dnorm(x[x >= truth[1]], mean = truth[1], sd = truth[3]) * truth[3] * truth[4] + lines(x, c(true.y1, true.y2), col = "green") } - max.y.0<-max(y.0,na.rm=TRUE) - y<-(y/max.y.0)^power - - l<-length(x) - min.d<-min(diff(x)) - dx<-c(x[2]-x[1], (x[3:l]-x[1:(l-2)])/2, x[l]-x[l-1]) - if(l ==2) dx=rep(diff(x),2) - dx[dx>4*min.d]<-4*min.d - - y.cum<-cumsum(y*dx) - x.y.cum<-cumsum(y*x*dx) - xsqr.y.cum<-cumsum(y*x^2*dx) - - y.cum.rev<-cumsum((y*dx)[l:1])[l:1] # reverse cum sum - x.y.cum.rev<-cumsum((x*y*dx)[l:1])[l:1] - xsqr.y.cum.rev<-cumsum((y*x^2*dx)[l:1])[l:1] - - sel<-which(y.cum >= sigma.ratio.lim[1]/(sigma.ratio.lim[1] + 1)*y.cum[l]) - if(length(sel)>0) - { - start<-max(1, min(sel)) - }else{ - start<-1 + max.y.0 <- max(y.0, na.rm = TRUE) + y <- (y / max.y.0)^power + + l <- length(x) + min.d <- min(diff(x)) + dx <- c(x[2] - x[1], (x[3:l] - x[1:(l - 2)]) / 2, x[l] - x[l - 1]) + if (l == 2) dx <- rep(diff(x), 2) + dx[dx > 4 * min.d] <- 4 * min.d + + y.cum <- cumsum(y * dx) + x.y.cum <- cumsum(y * x * dx) + xsqr.y.cum <- cumsum(y * x^2 * dx) + + y.cum.rev <- cumsum((y * dx)[l:1])[l:1] # reverse cum sum + x.y.cum.rev <- cumsum((x * y * dx)[l:1])[l:1] + xsqr.y.cum.rev <- cumsum((y * x^2 * dx)[l:1])[l:1] + + sel <- which(y.cum >= sigma.ratio.lim[1] / (sigma.ratio.lim[1] + 1) * y.cum[l]) + if 
(length(sel) > 0) { + start <- max(1, min(sel)) + } else { + start <- 1 } - sel<-which(y.cum <= sigma.ratio.lim[2]/(sigma.ratio.lim[2] + 1)*y.cum[l]) - if(length(sel)>0) - { - end<-min(l-1, max(sel)) - }else{ - end<-l-1 + sel <- which(y.cum <= sigma.ratio.lim[2] / (sigma.ratio.lim[2] + 1) * y.cum[l]) + if (length(sel) > 0) { + end <- min(l - 1, max(sel)) + } else { + end <- l - 1 } - if(end <= start) - { - m<-min(mean(x[start:end]), x[max(which(y.cum.rev>0))]) - }else{ - m.candi<-x[start:end]+diff(x[start:(end+1)])/2 - rec<-matrix(0, ncol=3, nrow=end-start+1) - - s1<-sqrt((xsqr.y.cum[start:end]+m.candi^2*y.cum[start:end]-2*m.candi*x.y.cum[start:end])/y.cum[start:end]) - s2<-sqrt((xsqr.y.cum.rev[start:end+1]+m.candi^2*y.cum.rev[start:end+1]-2*m.candi*x.y.cum.rev[start:end+1])/y.cum.rev[start:end+1]) - rec[,1]<-s1 - rec[,2]<-s2 - rec[,3]<-y.cum[start:end]/y.cum.rev[start:end+1] - - d<-log(rec[,1]/rec[,2])-log(rec[,3]) - if(min(d,na.rm=TRUE)*max(d,na.rm=TRUE) < 0) - { - sel<-c(which(d==max(d[d<0]))[1], which(d==min(d[d>=0]))) - m<-(sum(abs(d[sel])*m.candi[sel]))/(sum(abs(d[sel]))) - }else{ - d<-abs(d) - m<-m.candi[which(d==min(d,na.rm=TRUE))[1]] + if (end <= start) { + m <- min(mean(x[start:end]), x[max(which(y.cum.rev > 0))]) + } else { + m.candi <- x[start:end] + diff(x[start:(end + 1)]) / 2 + rec <- matrix(0, ncol = 3, nrow = end - start + 1) + + s1 <- sqrt((xsqr.y.cum[start:end] + m.candi^2 * y.cum[start:end] - 2 * m.candi * x.y.cum[start:end]) / y.cum[start:end]) + s2 <- sqrt((xsqr.y.cum.rev[start:end + 1] + m.candi^2 * y.cum.rev[start:end + 1] - 2 * m.candi * x.y.cum.rev[start:end + 1]) / y.cum.rev[start:end + 1]) + rec[, 1] <- s1 + rec[, 2] <- s2 + rec[, 3] <- y.cum[start:end] / y.cum.rev[start:end + 1] + + d <- log(rec[, 1] / rec[, 2]) - log(rec[, 3]) + if (min(d, na.rm = TRUE) * max(d, na.rm = TRUE) < 0) { + sel <- c(which(d == max(d[d < 0]))[1], which(d == min(d[d >= 0]))) + m <- (sum(abs(d[sel]) * m.candi[sel])) / (sum(abs(d[sel]))) + } else { + d <- abs(d) + m <- m.candi[which(d == min(d, na.rm = TRUE))[1]] } } - - if(do.plot) abline(v=m) - - sel1<-which(x=m) - s1<-sqrt(sum((x[sel1]-m)^2*y[sel1]*dx[sel1])/sum(y[sel1]*dx[sel1])) - s2<-sqrt(sum((x[sel2]-m)^2*y[sel2]*dx[sel2])/sum(y[sel2]*dx[sel2])) - - - if(power != 1) - { - s1<-s1*sqrt(power) - s2<-s2*sqrt(power) + + if (do.plot) abline(v = m) + + sel1 <- which(x < m) + sel2 <- which(x >= m) + s1 <- sqrt(sum((x[sel1] - m)^2 * y[sel1] * dx[sel1]) / sum(y[sel1] * dx[sel1])) + s2 <- sqrt(sum((x[sel2] - m)^2 * y[sel2] * dx[sel2]) / sum(y[sel2] * dx[sel2])) + + + if (power != 1) { + s1 <- s1 * sqrt(power) + s2 <- s2 * sqrt(power) } - - d1<-dnorm(x[sel1], sd=s1, mean=m) - d2<-dnorm(x[sel2], sd=s2, mean=m) - d<-c(d1*s1,d2*s2) # notice this "density" doesnt integrate to 1. Rather it integrates to (s1+s2)/2 - y<-y.0 - - dy.ratio<-d^2*log(y/d) - dy.ratio[is.na(dy.ratio)]<-0 - dy.ratio[dy.ratio == -Inf]<-0 - dy.ratio[dy.ratio == Inf]<-0 - - - scale<-exp(sum(dy.ratio)/sum(d^2)) - - if(do.plot) - { - fit.1<-d*scale - lines(x[y>0],fit.1,col="red") + + d1 <- dnorm(x[sel1], sd = s1, mean = m) + d2 <- dnorm(x[sel2], sd = s2, mean = m) + d <- c(d1 * s1, d2 * s2) # notice this "density" doesnt integrate to 1. 
Rather it integrates to (s1+s2)/2 + y <- y.0 + + dy.ratio <- d^2 * log(y / d) + dy.ratio[is.na(dy.ratio)] <- 0 + dy.ratio[dy.ratio == -Inf] <- 0 + dy.ratio[dy.ratio == Inf] <- 0 + + + scale <- exp(sum(dy.ratio) / sum(d^2)) + + if (do.plot) { + fit.1 <- d * scale + lines(x[y > 0], fit.1, col = "red") } - - to.return<-c(m,s1,s2,scale) - if(sum(is.na(to.return)) > 0) - { - m<-sum(x*y)/sum(y) - s1<-s2<-sum(y*(x-m)^2)/sum(y) - scale<-sum(y)/s1 - to.return<-c(m,s1,s2,scale) + + to.return <- c(m, s1, s2, scale) + if (sum(is.na(to.return)) > 0) { + m <- sum(x * y) / sum(y) + s1 <- s2 <- sum(y * (x - m)^2) / sum(y) + scale <- sum(y) / s1 + to.return <- c(m, s1, s2, scale) } } return(to.return) } - + ############## - bigauss.mix<-function(x,y,power=1, do.plot=FALSE, sigma.ratio.lim=c(0.1, 10), bw=c(15,30,60), eliminate=.05, max.iter=25, estim.method, BIC.factor=2) - { - all.bw<-bw[order(bw)] - - x.0<-x - y.0<-y - - sel<-y>1e-5 - x<-x[sel] - y<-y[sel] - sel<-order(x) - y<-y[sel] - x<-x[sel] - results<-new("list") - smoother.pk.rec<-smoother.vly.rec<-new("list") - bic.rec<-all.bw - - if(do.plot) - { - par(mfrow=c(ceiling(length(all.bw)/2),2)) - par(mar=c(1,1,1,1)) + bigauss.mix <- function(x, y, power = 1, do.plot = FALSE, sigma.ratio.lim = c(0.1, 10), bw = c(15, 30, 60), eliminate = .05, max.iter = 25, estim.method, BIC.factor = 2) { + all.bw <- bw[order(bw)] + + x.0 <- x + y.0 <- y + + sel <- y > 1e-5 + x <- x[sel] + y <- y[sel] + sel <- order(x) + y <- y[sel] + x <- x[sel] + results <- new("list") + smoother.pk.rec <- smoother.vly.rec <- new("list") + bic.rec <- all.bw + + if (do.plot) { + par(mfrow = c(ceiling(length(all.bw) / 2), 2)) + par(mar = c(1, 1, 1, 1)) } - - last.num.pks<-Inf - - for(bw.n in length(all.bw):1) + + last.num.pks <- Inf + + for (bw.n in length(all.bw):1) { - bw<-all.bw[bw.n] - this.smooth<-ksmooth(x.0,y.0, kernel="normal", bandwidth=bw) - turns<-find.turn.point(this.smooth$y) - pks<-this.smooth$x[turns$pks] - vlys<-c(-Inf, this.smooth$x[turns$vlys], Inf) - - smoother.pk.rec[[bw.n]]<-pks - smoother.vly.rec[[bw.n]]<-vlys - if(length(pks) != last.num.pks) - { - last.num.pks<-length(pks) - l<-length(x) - dx<-c(x[2]-x[1], (x[3:l]-x[1:(l-2)])/2, x[l]-x[l-1]) - if(l ==2) dx=rep(diff(x),2) - + bw <- all.bw[bw.n] + this.smooth <- ksmooth(x.0, y.0, kernel = "normal", bandwidth = bw) + turns <- find.turn.point(this.smooth$y) + pks <- this.smooth$x[turns$pks] + vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf) + + smoother.pk.rec[[bw.n]] <- pks + smoother.vly.rec[[bw.n]] <- vlys + if (length(pks) != last.num.pks) { + last.num.pks <- length(pks) + l <- length(x) + dx <- c(x[2] - x[1], (x[3:l] - x[1:(l - 2)]) / 2, x[l] - x[l - 1]) + if (l == 2) dx <- rep(diff(x), 2) + # initiation - m<-s1<-s2<-delta<-pks - for(i in 1:length(m)) + m <- s1 <- s2 <- delta <- pks + for (i in 1:length(m)) { - sel.1<-which(x >= max(vlys[vlys < m[i]]) & x < m[i]) - s1[i]<-sqrt(sum((x[sel.1]-m[i])^2 * y[sel.1]*dx[sel.1])/sum(y[sel.1]*dx[sel.1])) - - sel.2<-which(x >= m[i] & x < min(vlys[vlys > m[i]])) - s2[i]<-sqrt(sum((x[sel.2]-m[i])^2 * y[sel.2] * dx[sel.2])/sum(y[sel.2]*dx[sel.2])) - - delta[i]<-(sum(y[sel.1]*dx[sel.1]) + sum(y[sel.2]*dx[sel.2]))/((sum(dnorm(x[sel.1], mean=m[i], sd=s1[i])) * s1[i] /2)+(sum(dnorm(x[sel.2], mean=m[i], sd=s2[i])) * s2[i] /2)) + sel.1 <- which(x >= max(vlys[vlys < m[i]]) & x < m[i]) + s1[i] <- sqrt(sum((x[sel.1] - m[i])^2 * y[sel.1] * dx[sel.1]) / sum(y[sel.1] * dx[sel.1])) + + sel.2 <- which(x >= m[i] & x < min(vlys[vlys > m[i]])) + s2[i] <- sqrt(sum((x[sel.2] - m[i])^2 * y[sel.2] * 
dx[sel.2]) / sum(y[sel.2] * dx[sel.2])) + + delta[i] <- (sum(y[sel.1] * dx[sel.1]) + sum(y[sel.2] * dx[sel.2])) / ((sum(dnorm(x[sel.1], mean = m[i], sd = s1[i])) * s1[i] / 2) + (sum(dnorm(x[sel.2], mean = m[i], sd = s2[i])) * s2[i] / 2)) } - delta[is.na(delta)]<-1e-10 - s1[is.na(s1)]<-1e-10 - s2[is.na(s2)]<-1e-10 - - - fit<-matrix(0,ncol=length(m), nrow=length(x)) # this is the matrix of fitted values - - this.change=Inf - counter=0 - - while(this.change > 0.1 & counter <= max.iter) - { - counter<-counter+1 - old.m<-m - + delta[is.na(delta)] <- 1e-10 + s1[is.na(s1)] <- 1e-10 + s2[is.na(s2)] <- 1e-10 + + + fit <- matrix(0, ncol = length(m), nrow = length(x)) # this is the matrix of fitted values + + this.change <- Inf + counter <- 0 + + while (this.change > 0.1 & counter <= max.iter) { + counter <- counter + 1 + old.m <- m + # E step - cuts<-c(-Inf, m, Inf) - for(j in 2:length(cuts)) + cuts <- c(-Inf, m, Inf) + for (j in 2:length(cuts)) { - sel<-which(x >= cuts[j-1] & x < cuts[j]) - use.s1<-which(1:length(m) >= (j-1)) - s.to.use<-s2 - s.to.use[use.s1]<-s1[use.s1] - for(i in 1:ncol(fit)) + sel <- which(x >= cuts[j - 1] & x < cuts[j]) + use.s1 <- which(1:length(m) >= (j - 1)) + s.to.use <- s2 + s.to.use[use.s1] <- s1[use.s1] + for (i in 1:ncol(fit)) { - fit[sel,i]<-dnorm(x[sel], mean=m[i], sd = s.to.use[i]) * s.to.use[i] *delta[i] + fit[sel, i] <- dnorm(x[sel], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i] } } - fit[is.na(fit)]<-0 - sum.fit<-apply(fit, 1, sum) - + fit[is.na(fit)] <- 0 + sum.fit <- apply(fit, 1, sum) + # Elimination step - fit<-fit/sum.fit - fit2<-fit * y - perc.explained<-apply(fit2,2,sum)/sum(y) - max.erase<-max(1, round(length(perc.explained)/5)) - - to.erase<-which(perc.explained <= min(eliminate, perc.explained[order(perc.explained, na.last=FALSE)[max.erase]])) - - - if(length(to.erase) > 0) - { - m<-m[-to.erase] - s1<-s1[-to.erase] - s2<-s2[-to.erase] - delta<-delta[-to.erase] - fit<-fit[,-to.erase] - if(is.null(ncol(fit))) fit<-matrix(fit, ncol=1) - sum.fit<-apply(fit, 1, sum) - fit<-fit/sum.fit - old.m<-old.m[-to.erase] + fit <- fit / sum.fit + fit2 <- fit * y + perc.explained <- apply(fit2, 2, sum) / sum(y) + max.erase <- max(1, round(length(perc.explained) / 5)) + + to.erase <- which(perc.explained <= min(eliminate, perc.explained[order(perc.explained, na.last = FALSE)[max.erase]])) + + + if (length(to.erase) > 0) { + m <- m[-to.erase] + s1 <- s1[-to.erase] + s2 <- s2[-to.erase] + delta <- delta[-to.erase] + fit <- fit[, -to.erase] + if (is.null(ncol(fit))) fit <- matrix(fit, ncol = 1) + sum.fit <- apply(fit, 1, sum) + fit <- fit / sum.fit + old.m <- old.m[-to.erase] } - + # M step - for(i in 1:length(m)) + for (i in 1:length(m)) { - this.y<-y * fit[,i] - if(estim.method=="moment") - { - this.fit<-bigauss.esti(x,this.y,power=power,do.plot=FALSE, sigma.ratio.lim=sigma.ratio.lim) - }else{ - this.fit<-bigauss.esti.EM(x,this.y,power=power,do.plot=FALSE, sigma.ratio.lim=sigma.ratio.lim) + this.y <- y * fit[, i] + if (estim.method == "moment") { + this.fit <- bigauss.esti(x, this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim) + } else { + this.fit <- bigauss.esti.EM(x, this.y, power = power, do.plot = FALSE, sigma.ratio.lim = sigma.ratio.lim) } - m[i]<-this.fit[1] - s1[i]<-this.fit[2] - s2[i]<-this.fit[3] - delta[i]<-this.fit[4] + m[i] <- this.fit[1] + s1[i] <- this.fit[2] + s2[i] <- this.fit[3] + delta[i] <- this.fit[4] } - delta[is.na(delta)]<-0 - - #amount of change - this.change<-sum((old.m-m)^2) + delta[is.na(delta)] <- 0 + + # amount 
of change + this.change <- sum((old.m - m)^2) } - cuts<-c(-Inf, m, Inf) - fit<-fit*0 - for(j in 2:length(cuts)) + cuts <- c(-Inf, m, Inf) + fit <- fit * 0 + for (j in 2:length(cuts)) { - sel<-which(x >= cuts[j-1] & x < cuts[j]) - use.s1<-which(1:length(m) >= (j-1)) - s.to.use<-s2 - s.to.use[use.s1]<-s1[use.s1] - for(i in 1:ncol(fit)) + sel <- which(x >= cuts[j - 1] & x < cuts[j]) + use.s1 <- which(1:length(m) >= (j - 1)) + s.to.use <- s2 + s.to.use[use.s1] <- s1[use.s1] + for (i in 1:ncol(fit)) { - if(s.to.use[i] != 0) - { - fit[sel,i]<-dnorm(x[sel], mean=m[i], sd = s.to.use[i]) * s.to.use[i] *delta[i] + if (s.to.use[i] != 0) { + fit[sel, i] <- dnorm(x[sel], mean = m[i], sd = s.to.use[i]) * s.to.use[i] * delta[i] } } } - - if(do.plot) - { - plot(x,y,cex=.1,main=paste("bw=",bw)) - sum.fit<-apply(fit, 1, sum) - lines(x,sum.fit) - abline(v=m) - cols<-c("red","green","blue","cyan","brown","black",rep("grey",100)) - for(i in 1:length(m)) + + if (do.plot) { + plot(x, y, cex = .1, main = paste("bw=", bw)) + sum.fit <- apply(fit, 1, sum) + lines(x, sum.fit) + abline(v = m) + cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100)) + for (i in 1:length(m)) { - lines(x, fit[,i],col=cols[i]) + lines(x, fit[, i], col = cols[i]) } } - area<-delta*(s1+s2)/2 - rss<-sum((y-apply(fit,1,sum))^2) - l<-length(x) - bic<-l*log(rss/l)+4*length(m)*log(l)*BIC.factor - results[[bw.n]]<-cbind(m,s1,s2,delta,area) - bic.rec[bw.n]<-bic - }else{ - results[[bw.n]]<-NA - bic.rec[bw.n]<-Inf - results[[bw.n]]<-results[[bw.n+1]] - + area <- delta * (s1 + s2) / 2 + rss <- sum((y - apply(fit, 1, sum))^2) + l <- length(x) + bic <- l * log(rss / l) + 4 * length(m) * log(l) * BIC.factor + results[[bw.n]] <- cbind(m, s1, s2, delta, area) + bic.rec[bw.n] <- bic + } else { + results[[bw.n]] <- NA + bic.rec[bw.n] <- Inf + results[[bw.n]] <- results[[bw.n + 1]] } } - sel<-which(bic.rec == min(bic.rec, na.rm=TRUE)) - if(length(sel) > 1) sel<-sel[which(all.bw[sel]==max(all.bw[sel]))] - rec<-new("list") - rec$param<-results[[sel]] - rec$smoother.pks<-smoother.pk.rec - rec$smoother.vlys<-smoother.vly.rec - rec$all.param<-results - rec$bic<-bic.rec + sel <- which(bic.rec == min(bic.rec, na.rm = TRUE)) + if (length(sel) > 1) sel <- sel[which(all.bw[sel] == max(all.bw[sel]))] + rec <- new("list") + rec$param <- results[[sel]] + rec$smoother.pks <- smoother.pk.rec + rec$smoother.vlys <- smoother.vly.rec + rec$all.param <- results + rec$bic <- bic.rec return(rec) } - + ############# - normix<-function(that.curve, pks, vlys, ignore=0.1, max.iter=50, prob.cut=1e-2) - { - x<-that.curve[,1] - y<-that.curve[,2] - - if(length(pks)==1) - { - miu<-sum(x*y)/sum(y) - sigma<-sqrt(sum(y*(x-miu)^2)/sum(y)) - fitted<-dnorm(x, mean=miu, sd=sigma) - this.sel<-y>0 & fitted/dnorm(miu,mean=miu,sd=sigma)>prob.cut - sc<-exp(sum(fitted[this.sel]^2*log(y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2))) - }else{ - pks<-pks[order(pks)] - vlys<-vlys[order(vlys)] - l<-length(pks) - miu<-sigma<-sc<-pks - w<-matrix(0,nrow=l, ncol=length(x)) - - for(m in 1:l) + normix <- function(that.curve, pks, vlys, ignore = 0.1, max.iter = 50, prob.cut = 1e-2) { + x <- that.curve[, 1] + y <- that.curve[, 2] + + if (length(pks) == 1) { + miu <- sum(x * y) / sum(y) + sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) + fitted <- dnorm(x, mean = miu, sd = sigma) + this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > prob.cut + sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) + } else { + pks <- 
pks[order(pks)] + vlys <- vlys[order(vlys)] + l <- length(pks) + miu <- sigma <- sc <- pks + w <- matrix(0, nrow = l, ncol = length(x)) + + for (m in 1:l) { - this.low<-max(vlys[vlys<=pks[m]]) - this.high<-min(vlys[vlys>=pks[m]]) - this.x<-x[x>=this.low & x<=this.high] - this.y<-y[x>=this.low & x<=this.high] - - miu[m]<-sum(this.x*this.y)/sum(this.y) - #if(sum(this.y>0) > 1) - #{ - sigma[m]<-sqrt(sum(this.y*(this.x-miu[m])^2)/sum(this.y)) - #}else{ + this.low <- max(vlys[vlys <= pks[m]]) + this.high <- min(vlys[vlys >= pks[m]]) + this.x <- x[x >= this.low & x <= this.high] + this.y <- y[x >= this.low & x <= this.high] + + miu[m] <- sum(this.x * this.y) / sum(this.y) + # if(sum(this.y>0) > 1) + # { + sigma[m] <- sqrt(sum(this.y * (this.x - miu[m])^2) / sum(this.y)) + # }else{ # sigma[m]<-mean(diff(this.x))/2 - #} - fitted<-dnorm(this.x, mean=miu[m], sd=sigma[m]) - this.sel<-this.y>0 & fitted/dnorm(miu[m],mean=miu[m],sd=sigma[m]) > prob.cut - sc[m]<-exp(sum(fitted[this.sel]^2*log(this.y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2))) - #sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef + # } + fitted <- dnorm(this.x, mean = miu[m], sd = sigma[m]) + this.sel <- this.y > 0 & fitted / dnorm(miu[m], mean = miu[m], sd = sigma[m]) > prob.cut + sc[m] <- exp(sum(fitted[this.sel]^2 * log(this.y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) + # sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef } - - to.erase<-which(is.na(miu) | is.na(sigma) | sigma==0 | is.na(sc)) - if(length(to.erase)>0) - { - l<-l-length(to.erase) - miu<-miu[-to.erase] - sigma<-sigma[-to.erase] - sc<-sc[-to.erase] - w<-w[-to.erase,] + + to.erase <- which(is.na(miu) | is.na(sigma) | sigma == 0 | is.na(sc)) + if (length(to.erase) > 0) { + l <- l - length(to.erase) + miu <- miu[-to.erase] + sigma <- sigma[-to.erase] + sc <- sc[-to.erase] + w <- w[-to.erase, ] } - - direc<-1 - diff<-1000 - iter<-0 - - while(diff>0.05 & iter0 & fitted/dnorm(miu,mean=miu,sd=sigma) > prob.cut - sc<-exp(sum(fitted[this.sel]^2*log(y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2))) - #sc<-lm(y[this.sel]~fitted[this.sel]+0)$coef - break; + + direc <- 1 + diff <- 1000 + iter <- 0 + + while (diff > 0.05 & iter < max.iter) { + iter <- iter + 1 + if (l == 1) { + miu <- sum(x * y) / sum(y) + sigma <- sqrt(sum(y * (x - miu)^2) / sum(y)) + fitted <- dnorm(x, mean = miu, sd = sigma) + this.sel <- y > 0 & fitted / dnorm(miu, mean = miu, sd = sigma) > prob.cut + sc <- exp(sum(fitted[this.sel]^2 * log(y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) + # sc<-lm(y[this.sel]~fitted[this.sel]+0)$coef + break } - miu.0<-miu - sigma.0<-sigma - sc.0<-sc - - all.w<-y*0 - for(m in 1:l) + miu.0 <- miu + sigma.0 <- sigma + sc.0 <- sc + + all.w <- y * 0 + for (m in 1:l) { - all.w<-all.w+dnorm(x,mean=miu[m],sd=sigma[m])*sc[m] + all.w <- all.w + dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] } - - for(m in 1:l) + + for (m in 1:l) { - w[m,]<-dnorm(x,mean=miu[m],sd=sigma[m])*sc[m]/all.w + w[m, ] <- dnorm(x, mean = miu[m], sd = sigma[m]) * sc[m] / all.w } - - if(sum(is.na(w)) >0) break; - - for(m in 1:l) + + if (sum(is.na(w)) > 0) break + + for (m in 1:l) { - this.y<-y*w[m,] - miu[m]<-sum(x*this.y)/sum(this.y) - sigma[m]<-sqrt(sum(this.y*(x-miu[m])^2)/sum(this.y)) - fitted<-dnorm(x,mean=miu[m],sd=sigma[m]) - this.sel<-this.y>0 & fitted/dnorm(miu[m],mean=miu[m],sd=sigma[m])> prob.cut - sc[m]<-exp(sum(fitted[this.sel]^2*log(this.y[this.sel]/fitted[this.sel])/sum(fitted[this.sel]^2))) - #sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef + this.y <- y 
* w[m, ] + miu[m] <- sum(x * this.y) / sum(this.y) + sigma[m] <- sqrt(sum(this.y * (x - miu[m])^2) / sum(this.y)) + fitted <- dnorm(x, mean = miu[m], sd = sigma[m]) + this.sel <- this.y > 0 & fitted / dnorm(miu[m], mean = miu[m], sd = sigma[m]) > prob.cut + sc[m] <- exp(sum(fitted[this.sel]^2 * log(this.y[this.sel] / fitted[this.sel]) / sum(fitted[this.sel]^2))) + # sc[m]<-lm(this.y[this.sel]~fitted[this.sel]+0)$coef } - diff<-sum((miu.0-miu)^2) - - www<-w - for(m in 1:l) + diff <- sum((miu.0 - miu)^2) + + www <- w + for (m in 1:l) { - www[m,]<-www[m,]*y + www[m, ] <- www[m, ] * y } - www<-apply(www,1,sum) - www[which(is.na(sc))]<-0 - www<-www/sum(www) - max.erase<-max(1, round(l/5)) - - to.erase<-which(www <= min(ignore, www[order(www, na.last=FALSE)[max.erase]])) - - if(length(to.erase)>0) - { - l<-l-length(to.erase) - miu<-miu[-to.erase] - sigma<-sigma[-to.erase] - sc<-sc[-to.erase] - w<-w[-to.erase,] - diff<-1000 + www <- apply(www, 1, sum) + www[which(is.na(sc))] <- 0 + www <- www / sum(www) + max.erase <- max(1, round(l / 5)) + + to.erase <- which(www <= min(ignore, www[order(www, na.last = FALSE)[max.erase]])) + + if (length(to.erase) > 0) { + l <- l - length(to.erase) + miu <- miu[-to.erase] + sigma <- sigma[-to.erase] + sc <- sc[-to.erase] + w <- w[-to.erase, ] + diff <- 1000 } } } - l<-length(miu) - if(l==1) - { - rec<-matrix(c(miu, sigma, sc),nrow=1) - }else{ - rec<-cbind(miu,sigma,sc) + l <- length(miu) + if (l == 1) { + rec <- matrix(c(miu, sigma, sc), nrow = 1) + } else { + rec <- cbind(miu, sigma, sc) } - colnames(rec)<-c("miu","sigma","scale") + colnames(rec) <- c("miu", "sigma", "scale") return(rec) } - + ########## - - normix.bic<-function(x,y,power=2, do.plot=FALSE, bw=c(15,30,60), eliminate=.05, max.iter=50, BIC.factor=2) - { - all.bw<-bw[order(bw)] - sel<-y>1e-5 - x<-x[sel] - y<-y[sel] - sel<-order(x) - y<-y[sel] - x<-x[sel] - results<-new("list") - smoother.pk.rec<-smoother.vly.rec<-new("list") - bic.rec<-all.bw - - if(do.plot) - { - par(mfrow=c(ceiling(length(all.bw)/2),2)) - par(mar=c(1,1,1,1)) + + normix.bic <- function(x, y, power = 2, do.plot = FALSE, bw = c(15, 30, 60), eliminate = .05, max.iter = 50, BIC.factor = 2) { + all.bw <- bw[order(bw)] + sel <- y > 1e-5 + x <- x[sel] + y <- y[sel] + sel <- order(x) + y <- y[sel] + x <- x[sel] + results <- new("list") + smoother.pk.rec <- smoother.vly.rec <- new("list") + bic.rec <- all.bw + + if (do.plot) { + par(mfrow = c(ceiling(length(all.bw) / 2), 2)) + par(mar = c(1, 1, 1, 1)) } - - last.num.pks<-Inf - - for(bw.n in length(all.bw):1) + + last.num.pks <- Inf + + for (bw.n in length(all.bw):1) { - bw<-all.bw[bw.n] - this.smooth<-ksmooth(x,y, kernel="normal", bandwidth=bw) - turns<-find.turn.point(this.smooth$y) - pks<-this.smooth$x[turns$pks] - vlys<-c(-Inf, this.smooth$x[turns$vlys], Inf) - - smoother.pk.rec[[bw.n]]<-pks - smoother.vly.rec[[bw.n]]<-vlys - if(length(pks) != last.num.pks) - { - last.num.pks<-length(pks) - aaa<-normix(cbind(x,y), pks=pks, vlys=vlys, ignore=eliminate, max.iter=max.iter) - - total.fit<-x*0 - for(i in 1:nrow(aaa)) + bw <- all.bw[bw.n] + this.smooth <- ksmooth(x, y, kernel = "normal", bandwidth = bw) + turns <- find.turn.point(this.smooth$y) + pks <- this.smooth$x[turns$pks] + vlys <- c(-Inf, this.smooth$x[turns$vlys], Inf) + + smoother.pk.rec[[bw.n]] <- pks + smoother.vly.rec[[bw.n]] <- vlys + if (length(pks) != last.num.pks) { + last.num.pks <- length(pks) + aaa <- normix(cbind(x, y), pks = pks, vlys = vlys, ignore = eliminate, max.iter = max.iter) + + total.fit <- x * 0 + for (i in 
1:nrow(aaa)) { - total.fit<-total.fit+dnorm(x,mean=aaa[i,1], sd=aaa[i,2])*aaa[i,3] + total.fit <- total.fit + dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3] } - - if(do.plot) - { - plot(x,y,cex=.1,main=paste("bw=",bw)) - abline(v=aaa[,1]) - cols<-c("red","green","blue","cyan","brown","black",rep("grey",100)) - for(i in 1:nrow(aaa)) + + if (do.plot) { + plot(x, y, cex = .1, main = paste("bw=", bw)) + abline(v = aaa[, 1]) + cols <- c("red", "green", "blue", "cyan", "brown", "black", rep("grey", 100)) + for (i in 1:nrow(aaa)) { - lines(x, dnorm(x,mean=aaa[i,1], sd=aaa[i,2])*aaa[i,3],col=cols[i]) + lines(x, dnorm(x, mean = aaa[i, 1], sd = aaa[i, 2]) * aaa[i, 3], col = cols[i]) } } - - rss<-sum((y-total.fit)^2) - l<-length(x) - bic<-l*log(rss/l)+3*nrow(aaa)*log(l)*BIC.factor - results[[bw.n]]<-aaa - bic.rec[bw.n]<-bic - }else{ - bic.rec[bw.n]<-Inf - results[[bw.n]]<-results[[bw.n+1]] - + + rss <- sum((y - total.fit)^2) + l <- length(x) + bic <- l * log(rss / l) + 3 * nrow(aaa) * log(l) * BIC.factor + results[[bw.n]] <- aaa + bic.rec[bw.n] <- bic + } else { + bic.rec[bw.n] <- Inf + results[[bw.n]] <- results[[bw.n + 1]] } } - sel<-which(bic.rec == min(bic.rec)) - if(length(sel) > 1) sel<-sel[which(all.bw[sel]==max(all.bw[sel]))] - rec<-new("list") - rec$param<-results[[sel]] - rec$smoother.pks<-smoother.pk.rec - rec$smoother.vlys<-smoother.vly.rec - rec$all.param<-results - rec$bic<-bic.rec + sel <- which(bic.rec == min(bic.rec)) + if (length(sel) > 1) sel <- sel[which(all.bw[sel] == max(all.bw[sel]))] + rec <- new("list") + rec$param <- results[[sel]] + rec$smoother.pks <- smoother.pk.rec + rec$smoother.vlys <- smoother.vly.rec + rec$all.param <- results + rec$bic <- bic.rec return(rec) } - + ########## - - if(is.na(min.bw)) min.bw<-diff(range(a[,2], na.rm=TRUE))/60 - if(is.na(max.bw)) max.bw<-diff(range(a[,2], na.rm=TRUE))/15 - if(min.bw >= max.bw) min.bw<-max.bw/4 - - base.curve<-unique(a[,2]) - all.span<-range(base.curve) - base.curve<-base.curve[order(base.curve)] - base.curve<-cbind(base.curve, base.curve*0) - all.times<-base.curve[,1] - if(all.times[1]>0) all.times<-c(0,all.times) - all.times<-c(all.times, 2*all.times[length(all.times)]-all.times[length(all.times)-1]) - all.times<-(all.times[1:(length(all.times)-1)]+all.times[2:length(all.times)])/2 - all.times<-all.times[2:length(all.times)]-all.times[1:(length(all.times)-1)] - - this.features<-matrix(0,nrow=1,ncol=5) - colnames(this.features)<-c("mz","pos","sd1","sd2","area") - n=1 - start=1 - nrowa<-nrow(a) - - a.breaks<-c(0, which(a[1:(nrowa-1),4] != a[2:nrowa,4]), nrowa) - mz.sd.rec<-NA - - for(nnn in 1:(length(a.breaks)-1)) + + if (is.na(min.bw)) min.bw <- diff(range(a[, 2], na.rm = TRUE)) / 60 + if (is.na(max.bw)) max.bw <- diff(range(a[, 2], na.rm = TRUE)) / 15 + if (min.bw >= max.bw) min.bw <- max.bw / 4 + + base.curve <- unique(a[, 2]) + all.span <- range(base.curve) + base.curve <- base.curve[order(base.curve)] + base.curve <- cbind(base.curve, base.curve * 0) + all.times <- base.curve[, 1] + if (all.times[1] > 0) all.times <- c(0, all.times) + all.times <- c(all.times, 2 * all.times[length(all.times)] - all.times[length(all.times) - 1]) + all.times <- (all.times[1:(length(all.times) - 1)] + all.times[2:length(all.times)]) / 2 + all.times <- all.times[2:length(all.times)] - all.times[1:(length(all.times) - 1)] + + this.features <- matrix(0, nrow = 1, ncol = 5) + colnames(this.features) <- c("mz", "pos", "sd1", "sd2", "area") + n <- 1 + start <- 1 + nrowa <- nrow(a) + + a.breaks <- c(0, which(a[1:(nrowa - 1), 4] != 
a[2:nrowa, 4]), nrowa) + mz.sd.rec <- NA + + for (nnn in 1:(length(a.breaks) - 1)) { - this<-a[(a.breaks[nnn]+1):a.breaks[nnn+1],] - if(is.null(nrow(this))) this<-matrix(this, nrow=1) - this<-this[order(this[,2]),] - if(is.null(nrow(this))) this<-matrix(this, nrow=1) - mz.sd.rec<-c(mz.sd.rec, sd(this[,1])) - - this.count.1<-nrow(this) - if(this.count.1<=10) - { - if(this.count.1>1) - { - this.inte<-interpol.area(this[,2], this[,3], base.curve[,1], all.times) - xxx<-c(median(this[,1]),median(this[,2]), sd(this[,2]), sd(this[,2]), this.inte) - }else{ - this.time.weights<-all.times[which(base.curve[,1] %in% this[2])] - xxx<-c(this[1], this[2], NA, NA, this[3]*this.time.weights) + this <- a[(a.breaks[nnn] + 1):a.breaks[nnn + 1], ] + if (is.null(nrow(this))) this <- matrix(this, nrow = 1) + this <- this[order(this[, 2]), ] + if (is.null(nrow(this))) this <- matrix(this, nrow = 1) + mz.sd.rec <- c(mz.sd.rec, sd(this[, 1])) + + this.count.1 <- nrow(this) + if (this.count.1 <= 10) { + if (this.count.1 > 1) { + this.inte <- interpol.area(this[, 2], this[, 3], base.curve[, 1], all.times) + xxx <- c(median(this[, 1]), median(this[, 2]), sd(this[, 2]), sd(this[, 2]), this.inte) + } else { + this.time.weights <- all.times[which(base.curve[, 1] %in% this[2])] + xxx <- c(this[1], this[2], NA, NA, this[3] * this.time.weights) } - this.features<-rbind(this.features,xxx) - }else{ - this.span<-range(this[,2]) - this.curve<-base.curve[base.curve[,1]>=this.span[1] & base.curve[,1] <=this.span[2],] - this.curve[this.curve[,1] %in% this[,2],2]<-this[,3] - - bw<-min(max(bandwidth*(this.span[2]-this.span[1]),min.bw), max.bw) - bw<-seq(bw, 2*bw, length.out=3) - if(bw[1] > 1.5*min.bw) bw<-c(max(min.bw, bw[1]/2), bw) - - if(shape.model == "Gaussian") - { - xxx<-normix.bic(this.curve[,1], this.curve[,2], power=power, bw=bw, eliminate=component.eliminate, BIC.factor=BIC.factor)$param - if(nrow(xxx) == 1) - { - xxx<-c(xxx[1,1:2],xxx[1,2],xxx[1,3]) - }else{ - xxx<-cbind(xxx[,1:2],xxx[,2],xxx[,3]) + this.features <- rbind(this.features, xxx) + } else { + this.span <- range(this[, 2]) + this.curve <- base.curve[base.curve[, 1] >= this.span[1] & base.curve[, 1] <= this.span[2], ] + this.curve[this.curve[, 1] %in% this[, 2], 2] <- this[, 3] + + bw <- min(max(bandwidth * (this.span[2] - this.span[1]), min.bw), max.bw) + bw <- seq(bw, 2 * bw, length.out = 3) + if (bw[1] > 1.5 * min.bw) bw <- c(max(min.bw, bw[1] / 2), bw) + + if (shape.model == "Gaussian") { + xxx <- normix.bic(this.curve[, 1], this.curve[, 2], power = power, bw = bw, eliminate = component.eliminate, BIC.factor = BIC.factor)$param + if (nrow(xxx) == 1) { + xxx <- c(xxx[1, 1:2], xxx[1, 2], xxx[1, 3]) + } else { + xxx <- cbind(xxx[, 1:2], xxx[, 2], xxx[, 3]) } - }else{ - xxx<-bigauss.mix(this.curve[,1], this.curve[,2], sigma.ratio.lim=sigma.ratio.lim, bw=bw, power=power, estim.method=estim.method, eliminate=component.eliminate, BIC.factor=BIC.factor)$param[,c(1,2,3,5)] + } else { + xxx <- bigauss.mix(this.curve[, 1], this.curve[, 2], sigma.ratio.lim = sigma.ratio.lim, bw = bw, power = power, estim.method = estim.method, eliminate = component.eliminate, BIC.factor = BIC.factor)$param[, c(1, 2, 3, 5)] } - - if(is.null(nrow(xxx))) - { - this.features<-rbind(this.features,c(median(this[,1]),xxx)) - }else{ - for(m in 1:nrow(xxx)) + + if (is.null(nrow(xxx))) { + this.features <- rbind(this.features, c(median(this[, 1]), xxx)) + } else { + for (m in 1:nrow(xxx)) { - this.d<-abs(this[,2]-xxx[m,1]) - this.features<-rbind(this.features, 
c(mean(this[which(this.d==min(this.d)),1]),xxx[m,])) + this.d <- abs(this[, 2] - xxx[m, 1]) + this.features <- rbind(this.features, c(mean(this[which(this.d == min(this.d)), 1]), xxx[m, ])) } } } - - #message(nnn) + + # message(nnn) } - this.features<-this.features[-1,] - this.features<-this.features[order(this.features[,1], this.features[,2]),] - this.features<-this.features[which(apply(this.features[,3:4],1,min)>sd.cut[1] & apply(this.features[,3:4],1,max) sd.cut[1] & apply(this.features[, 3:4], 1, max) < sd.cut[2]), ] + rownames(this.features) <- NULL + + if (do.plot) { + par(mfrow = c(2, 2)) + plot(c(-1, 1), c(-1, 1), type = "n", xlab = "", ylab = "", main = "", axes = FALSE) + text(x = 0, y = 0, "Estimate peak \n area/location", cex = 1.5) + hist(mz.sd.rec, xlab = "m/z SD", ylab = "Frequency", main = "m/z SD distribution") + hist(c(this.features[, 3], this.features[, 4]), xlab = "Retention time SD", ylab = "Frequency", main = "Retention time SD distribution") + hist(log10(this.features[, 5]), xlab = "peak strength (log scale)", ylab = "Frequency", main = "Peak strength distribution") } - + return(this.features) } diff --git a/R/unsupervised.R b/R/unsupervised.R index 2de6b8b3..61b2aabf 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -48,64 +48,6 @@ sort_samples_by_acquisition_number <- function (filenames) { sort(unlist(filenames)) } -extract_features <- function( - cluster, - filenames, - min_pres, - min_run, - mz_tol, - baseline_correct, - baseline_correct_noise_percentile, - intensity_weighted, - min_bandwidth, - max_bandwidth, - sd_cut, - sigma_ratio_lim, - shape_model, - peak_estim_method, - component_eliminate, - moment_power, - BIC_factor -) { - clusterExport(cluster, list( - 'proc.cdf', - 'prof.to.features', - 'load.lcms', - 'adaptive.bin', - 'find.turn.point', - 'combine.seq.3', - 'cont.index', - 'interpol.area' - )) - - parLapply(cluster, filenames, function(filename) { - profile <- proc.cdf( - filename = filename, - min.pres = min_pres, - min.run = min_run, - tol = mz_tol, - baseline.correct = baseline_correct, - baseline.correct.noise.percentile = baseline_correct_noise_percentile, - intensity.weighted = intensity_weighted, - do.plot = FALSE, - cache = FALSE - ) - features <- prof.to.features( - a = profile, - min.bw = min_bandwidth, - max.bw = max_bandwidth, - sd.cut = sd_cut, - sigma.ratio.lim = sigma_ratio_lim, - shape.model = shape_model, - estim.method = peak_estim_method, - component.eliminate = component_eliminate, - power = moment_power, - BIC.factor = BIC_factor, - do.plot = FALSE - ) - }) -} - align_features <- function(sample_names, ...) { aligned <- feature.align(...) 
feature_names <- seq_len(nrow(aligned$pk.times)) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index 663bf593..7b7b3b83 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -26,4 +26,9 @@ dependencies: - r-tidyr - r-stringr - r-datacomparer - - r-tibble \ No newline at end of file + - r-tibble + - r-languageserver + - r-jsonlite + - r-styler + - r-datacomparer + - radian diff --git a/tests/testdata/extracted_features/extracted_0.parquet b/tests/testdata/extracted_features/extracted_0.parquet new file mode 100644 index 00000000..7a8661b8 Binary files /dev/null and b/tests/testdata/extracted_features/extracted_0.parquet differ diff --git a/tests/testdata/extracted_features/extracted_1.parquet b/tests/testdata/extracted_features/extracted_1.parquet new file mode 100644 index 00000000..7a8661b8 Binary files /dev/null and b/tests/testdata/extracted_features/extracted_1.parquet differ diff --git a/tests/testdata/extracted_features/extracted_2.parquet b/tests/testdata/extracted_features/extracted_2.parquet new file mode 100644 index 00000000..7a8661b8 Binary files /dev/null and b/tests/testdata/extracted_features/extracted_2.parquet differ diff --git a/tests/testdata/proc_cdf_expected.Rds b/tests/testdata/proc_cdf_expected.Rds new file mode 100644 index 00000000..b64a84d8 Binary files /dev/null and b/tests/testdata/proc_cdf_expected.Rds differ diff --git a/tests/testdata/prof_to_features_expected.Rds b/tests/testdata/prof_to_features_expected.Rds new file mode 100644 index 00000000..97791b3b Binary files /dev/null and b/tests/testdata/prof_to_features_expected.Rds differ diff --git a/tests/testthat/test-extract_features.R b/tests/testthat/test-extract_features.R new file mode 100644 index 00000000..051d1f44 --- /dev/null +++ b/tests/testthat/test-extract_features.R @@ -0,0 +1,33 @@ +test_that("extract single feature works", { + filename <- c("../testdata/mbr_test0.mzml") + + extracted_features <- proc.cdf(filename, cache = FALSE) + actual <- prof.to.features(extracted_features, do.plot = FALSE) + + expected <- as.data.frame(arrow::read_parquet("../testdata/extracted_features/extracted_0.parquet")) + actual <- as.data.frame(actual) + + actual <- unique(actual) + expected <- unique(expected) + + keys <- c("mz", "pos", "sd1", "sd2") + + actual <- dplyr::arrange_at( + actual, keys + ) + expected <- dplyr::arrange_at( + expected, keys + ) + + report <- dataCompareR::rCompare(actual, expected, keys = keys, roundDigits = 4, mismatches = 100000) + dataCompareR::saveReport(report, reportName = "extract_features_0", showInViewer = FALSE, HTMLReport = FALSE, mismatchCount = 10000) + + expect_true(nrow(report$rowMatching$inboth) >= 0.9 * nrow(expected)) + + incommon <- as.numeric(rownames(report$rowMatching$inboth)) + + subset_actual <- actual %>% dplyr::slice(incommon) + subset_expected <- expected %>% dplyr::slice(incommon) + + expect_equal(subset_actual$area, subset_expected$area, tolerance = 0.01 * max(expected$area)) +}) diff --git a/tests/testthat/test-proc.cdf.R b/tests/testthat/test-proc.cdf.R new file mode 100644 index 00000000..5d987406 --- /dev/null +++ b/tests/testthat/test-proc.cdf.R @@ -0,0 +1,6 @@ +test_that("test proc.cdf test", { + filename <- c('../testdata/mbr_test0.mzml') + actual <- proc.cdf(filename, cache = FALSE) + expected <- readRDS('../testdata/proc_cdf_expected.Rds') + expect_equal(actual, expected) +}) \ No newline at end of file diff --git a/tests/testthat/test-prof.to.features.R b/tests/testthat/test-prof.to.features.R new file mode 
100644 index 00000000..5b43dcdf --- /dev/null +++ b/tests/testthat/test-prof.to.features.R @@ -0,0 +1,11 @@ +test_that("prof.to.features works", { + extracted_features <- readRDS('../testdata/proc_cdf_expected.Rds') + actual <- prof.to.features( + extracted_features, + sd.cut = c(0.1, 100), + sigma.ratio.lim = c(0.1, 10), + do.plot = FALSE) + + expected <- readRDS("../testdata/prof_to_features_expected.Rds") + expect_equal(actual, expected) +}) \ No newline at end of file
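
Reviewer note, not part of the patch: a minimal usage sketch of how the refactored pieces fit together after this change, mirroring the calls made in the new tests. The file path, cluster size, and parameter values below are illustrative assumptions lifted from the defaults and test fixtures shown above, not a definitive pipeline.

# R usage sketch (assumes the package functions from this patch are attached)
library(parallel)

filename <- "../testdata/mbr_test0.mzml"        # test file referenced by the added tests (placeholder path)

# single-file path: load_data() -> adaptive.bin() -> cont.index(), then peak fitting
profile  <- proc.cdf(filename, cache = FALSE)
features <- prof.to.features(profile,
                             sd.cut = c(0.1, 100),
                             sigma.ratio.lim = c(0.1, 10),
                             do.plot = FALSE)

# parallel path: extract_features() runs the same two calls per file on a cluster;
# note it now also exports the new helpers (load_file, load_data, compute_breaks, ...)
cl <- makeCluster(2)                             # cluster size is arbitrary here
feature_tables <- extract_features(
  cluster = cl, filenames = list(filename),
  min_pres = 0.5, min_run = 12, mz_tol = 1e-05,
  baseline_correct = 0.0, baseline_correct_noise_percentile = 0.05,
  intensity_weighted = FALSE,
  min_bandwidth = NA, max_bandwidth = NA,
  sd_cut = c(0.1, 100), sigma_ratio_lim = c(0.1, 10),
  shape_model = "bi-Gaussian", peak_estim_method = "moment",
  component_eliminate = 0.01, moment_power = 1, BIC_factor = 2
)
stopCluster(cl)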