lgatto · lgatto · Mar 22, 2017 · Mar 20, 2017 · Mar 20, 2017 · Mar 20, 2017
diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +1,7 @@
 # MSnbase 2.1
 
 ## Changes in version 2.1.14
-- (nothing yet)
+- Internal rewrite and speedup of topN <2017-03-20 Mon>
 
 ## Changes in version 2.1.13
 - Internal rewrite and speedup of plotNA <2017-02-26 Sun>

diff --git a/R/methods-MSnSet.R b/R/methods-MSnSet.R
@@ -337,56 +337,34 @@ setMethod("combine",
 
 
 setMethod("topN", signature(object = "matrix"),
-          function(object, groupBy, n=3, fun, ...) {
+          function(object, groupBy, n=3, fun, ..., verbose=isMSnbaseVerbose()) {
             if (missing(groupBy))
               stop("Specify how to group features to select top ", n, ".")
             if (missing(fun)) {
               fun <- sum
-              if (ncol(object) > 1)
+              if (ncol(object) > 1 && verbose)
                 message("Ranking features using their sum.")
             }
-            rn <- rownames(object)
-            idx <- by(object, groupBy, getTopIdx, n, fun, ...)
-            object <- subsetBy(object, groupBy, idx)
-            if (!is.null(rn)) {
-              rownames(object) <- subsetBy(rn, groupBy, idx)
-            } else {
-              rownames(object) <- NULL
-            }
-            return(object)
+            object[.topIdx(object, groupBy=groupBy, n=n, fun=fun, ...), ]
           })
 
+
 setMethod("topN", signature(object = "MSnSet"),
-          function(object, groupBy, n=3, fun, ...) {
+          function(object, groupBy, n=3, fun, ..., verbose=isMSnbaseVerbose()) {
             if (missing(groupBy))
               stop("Specify how to group features to select top ", n, ".")
             if (missing(fun)) {
               fun <- sum
-              if (ncol(object) > 1)
+              if (ncol(object) > 1 && verbose)
                 message("Ranking features using their sum.")
             }
-            idx <- by(exprs(object), groupBy, getTopIdx, n, fun, ...)
-            fn <- subsetBy(featureNames(object), groupBy, idx)
-            .eset <- subsetBy(exprs(object), groupBy, idx)
-            if (!is.matrix(.eset))
-              .eset <- matrix(.eset, ncol = 1)
-            rownames(.eset) <- fn
-            .proc <- processingData(object)
-            .proc@processing <- c(.proc@processing,
-                                  paste0("Selected top ", n,
-                                         " features: ", date()))
-            .fdata <- subsetBy(fData(object), groupBy, idx)
-            ans <- new("MSnSet",
-                       experimentData = experimentData(object),
-                       exprs = .eset,
-                       phenoData = phenoData(object),
-                       featureData = new("AnnotatedDataFrame", data = .fdata),
-                       annotation = object@annotation,
-                       protocolData = protocolData(object))
-            ans@processingData <- .proc
-            featureNames(ans) <- fn
-            if (validObject(ans))
-              return(ans)
+            idx <- .topIdx(exprs(object), groupBy=groupBy, n=n, fun=fun, ...)
+            object@processingData@processing <- c(processingData(object)@processing,
+                                                  paste0("Selected top ", n,
+                                                  " features: ", date()))
+            object <- object[idx]
+            if (validObject(object))
+              return(object)
           })
 
 

diff --git a/R/utils.R b/R/utils.R
@@ -289,40 +289,49 @@ getVariableName <- function(match_call, varname) {
   tail(as.character(mcx), n = 1)
 }
 
-
-
-##
-## utils for topN method: getTopIdx and subsetBy
-##
-
-getTopIdx <- function(X, n, fun, ...) {
-  ## Rows of X are first summerised using fun.
-  ## Indices of the n highest values of vector X
-  ## are then returned.
-  ## input X: matrix [m,l]
-  ## output: numeric of length min(n, nrow(x))
-  ## If (l == 1), fun does not have any effect.
-  ## Otherwise, fun is required to keep the features
-  ## grouped into rows.
-  if (n < 1)
-    stop("'n' must be greater or equal than 1.")
-  n <- min(n, nrow(X))
-  X <- apply(X, 1, fun, ...)
-  base::order(X, decreasing = TRUE)[1:n]
-}
-
-subsetBy <- function(X, groups, byIdx) {
-  if ( is.null(dim(X)) || ncol(X) == 1 ) {
-    ## vector
-    unlist(mapply("[", x=split(as.vector(X), groups), i=byIdx,
-                  SIMPLIFY=FALSE, USE.NAMES=FALSE))
+#' summarise rows by an user-given function
+#'
+#' @param x matrix
+#' @param fun function to summarise rows, if \code{fun} equals
+#' \code{sum}/\code{mean} the more efficient \code{rowSums}/\code{rowMeans} are
+#' used.
+#' @param ... further arguments passed to \code{fun}
+#' @return double, summarised rows
+#' @noRd
+.summariseRows <- function(x, fun, ...) {
+  stopifnot(is.matrix(x))
+  stopifnot(is.function(fun))
+
+  if (identical(fun, sum)) {
+    rowSums(x, ...)
+  } else if (identical(fun, mean)) {
+    rowMeans(x, ...)
   } else {
-    ## matrix
-    ans <- mapply(function(i, j) {
-      X[i, , drop=FALSE][j, , drop=FALSE]
-    }, i=split(1:nrow(X), groups), j=byIdx, SIMPLIFY=FALSE, USE.NAMES=FALSE)
-    do.call(rbind, ans)
+    apply(x, 1L, fun, ...)
+  }
+}
+
+#' find top n indices of each group
+#'
+#' @param x matrix
+#' @param groupBy factor/character of length \code{nrow(x)}
+#' @param n consider just the top \code{n} values
+#' @param fun function to summarise rows
+#' @param ... further arguments passed to \code{fun}
+#' @return double, indices sorted by summarising function \code{fun}
+#' @noRd
+.topIdx <- function(x, groupBy, n, fun, ...) {
+  if (n < 1) {
+    stop(sQuote("n"), " has to be greater or equal than 1.")
+  }
+  if (nrow(x) != length(groupBy)) {
+    stop(sQuote("nrow(x)"), " and ", sQuote("length(groupBy)"),
+         " have to be equal.")
   }
+  rs <- .summariseRows(x, fun, ...)
+  o <- order(as.double(rs), decreasing=TRUE, na.last=TRUE)
+  idx <- unlist(lapply(split(o, groupBy[o]), "[", 1:n), use.names=FALSE)
+  idx[!is.na(idx)]
 }
 
 ## Computes header from assay data by-passing cache

diff --git a/man/MSnSet-class.Rd b/man/MSnSet-class.Rd
@@ -229,7 +229,8 @@
       2 or more \code{MSnSet} instances according to their feature names.
       Note that the \code{qual} slot and the processing information are
       silently dropped. }
-    \item{topN}{signature(object = "MSnSet", groupBy, n = 3, fun, ...)}{
+    \item{topN}{signature(object = "MSnSet", groupBy, n = 3, fun, ..., verbose =
+      isMSnbaseVerbose())}{
       Selects the \code{n} most intense features (typically peptides or
       spectra) out of all available for each set defined by
       \code{groupBy} (typically proteins) and creates a new instance of
@@ -239,8 +240,6 @@
       decreasing order. Additional parameters can be passed to
       \code{fun} through \code{...}, for instance to control the
       behaviour of \code{topN} in case of \code{NA} values.
-      Note that the \code{qual} slot and the processing information are
-      silently dropped.
       (Works also with \code{matrix} instances.)
 
       See also the \code{\link{nQuants}} function to retrieve the

diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R
@@ -262,3 +262,29 @@ test_that("Get first MS level", {
     expect_equivalent(y1, y2)
 })
 
+test_that(".summariseRows", {
+  m <- matrix(1:30, nrow=10)
+  m[seq(2, 30, by=3)] <- NA
+  expect_error(MSnbase:::.summariseRows(1:10, fun=sum))
+  expect_error(MSnbase:::.summariseRows(m, fun=1:10))
+  expect_equal(MSnbase:::.summariseRows(m, fun=sum), rep.int(NA_real_, 10))
+  expect_equal(MSnbase:::.summariseRows(m, fun=sum, na.rm=TRUE), rowSums(m, na.rm=TRUE))
+  expect_equal(MSnbase:::.summariseRows(m, fun=mean, na.rm=TRUE), rowMeans(m, na.rm=TRUE))
+  expect_equal(MSnbase:::.summariseRows(m, fun=max, na.rm=TRUE), c(21, 22, 13, 24, 25, 16, 27, 28, 19, 30))
+})
+
+test_that(".topIdx", {
+  m <- matrix(1:30, nrow=10)
+  g <- rep_len(LETTERS[1:3], 10)
+  expect_error(MSnbase:::.topIdx(m, groupBy=g, fun=sum)) # n missing
+  expect_error(MSnbase:::.topIdx(1:10, groupBy=g, fun=sum, n=3))
+  expect_error(MSnbase:::.topIdx(m, groupBy=g, fun=1:10, n=3))
+  expect_error(MSnbase:::.topIdx(m, groupBy=g, fun=sum, n=-1),
+               ".*n.* has to be greater or equal than 1.")
+  expect_error(MSnbase:::.topIdx(m, groupBy=1:3, fun=sum, n=3),
+               ".*nrow.*x.* and .*length.*groupBy.* have to be equal.")
+  expect_equal(MSnbase:::.topIdx(m, groupBy=g, fun=sum, n=3),
+               c(10, 7, 4, 8, 5, 2, 9, 6, 3))
+  expect_equal(MSnbase:::.topIdx(m, groupBy=g, fun=sum, n=2),
+               c(10, 7, 8, 5, 9, 6))
+})