Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rewrite topN and remove getTopIdx/subsetBy #199

Merged
merged 11 commits into from
Mar 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# MSnbase 2.1

## Changes in version 2.1.14
- (nothing yet)
- Internal rewrite and speedup of topN <2017-03-20 Mon>

## Changes in version 2.1.13
- Internal rewrite and speedup of plotNA <2017-02-26 Sun>
Expand Down
48 changes: 13 additions & 35 deletions R/methods-MSnSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -337,56 +337,34 @@ setMethod("combine",


setMethod("topN", signature(object = "matrix"),
function(object, groupBy, n=3, fun, ...) {
function(object, groupBy, n=3, fun, ..., verbose=isMSnbaseVerbose()) {
if (missing(groupBy))
stop("Specify how to group features to select top ", n, ".")
if (missing(fun)) {
fun <- sum
if (ncol(object) > 1)
if (ncol(object) > 1 && verbose)
message("Ranking features using their sum.")
}
rn <- rownames(object)
idx <- by(object, groupBy, getTopIdx, n, fun, ...)
object <- subsetBy(object, groupBy, idx)
if (!is.null(rn)) {
rownames(object) <- subsetBy(rn, groupBy, idx)
} else {
rownames(object) <- NULL
}
return(object)
object[.topIdx(object, groupBy=groupBy, n=n, fun=fun, ...), ]
})


setMethod("topN", signature(object = "MSnSet"),
function(object, groupBy, n=3, fun, ...) {
function(object, groupBy, n=3, fun, ..., verbose=isMSnbaseVerbose()) {
if (missing(groupBy))
stop("Specify how to group features to select top ", n, ".")
if (missing(fun)) {
fun <- sum
if (ncol(object) > 1)
if (ncol(object) > 1 && verbose)
message("Ranking features using their sum.")
}
idx <- by(exprs(object), groupBy, getTopIdx, n, fun, ...)
fn <- subsetBy(featureNames(object), groupBy, idx)
.eset <- subsetBy(exprs(object), groupBy, idx)
if (!is.matrix(.eset))
.eset <- matrix(.eset, ncol = 1)
rownames(.eset) <- fn
.proc <- processingData(object)
.proc@processing <- c(.proc@processing,
paste0("Selected top ", n,
" features: ", date()))
.fdata <- subsetBy(fData(object), groupBy, idx)
ans <- new("MSnSet",
experimentData = experimentData(object),
exprs = .eset,
phenoData = phenoData(object),
featureData = new("AnnotatedDataFrame", data = .fdata),
annotation = object@annotation,
protocolData = protocolData(object))
ans@processingData <- .proc
featureNames(ans) <- fn
if (validObject(ans))
return(ans)
idx <- .topIdx(exprs(object), groupBy=groupBy, n=n, fun=fun, ...)
object@processingData@processing <- c(processingData(object)@processing,
paste0("Selected top ", n,
" features: ", date()))
object <- object[idx]
if (validObject(object))
return(object)
})


Expand Down
73 changes: 41 additions & 32 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -289,40 +289,49 @@ getVariableName <- function(match_call, varname) {
tail(as.character(mcx), n = 1)
}



##
## utils for topN method: getTopIdx and subsetBy
##

getTopIdx <- function(X, n, fun, ...) {
## Rows of X are first summerised using fun.
## Indices of the n highest values of vector X
## are then returned.
## input X: matrix [m,l]
## output: numeric of length min(n, nrow(x))
## If (l == 1), fun does not have any effect.
## Otherwise, fun is required to keep the features
## grouped into rows.
if (n < 1)
stop("'n' must be greater or equal than 1.")
n <- min(n, nrow(X))
X <- apply(X, 1, fun, ...)
base::order(X, decreasing = TRUE)[1:n]
}

subsetBy <- function(X, groups, byIdx) {
if ( is.null(dim(X)) || ncol(X) == 1 ) {
## vector
unlist(mapply("[", x=split(as.vector(X), groups), i=byIdx,
SIMPLIFY=FALSE, USE.NAMES=FALSE))
#' summarise rows by an user-given function
#'
#' @param x matrix
#' @param fun function to summarise rows, if \code{fun} equals
#' \code{sum}/\code{mean} the more efficient \code{rowSums}/\code{rowMeans} are
#' used.
#' @param ... further arguments passed to \code{fun}
#' @return double, summarised rows
#' @noRd
.summariseRows <- function(x, fun, ...) {
stopifnot(is.matrix(x))
stopifnot(is.function(fun))

if (identical(fun, sum)) {
rowSums(x, ...)
} else if (identical(fun, mean)) {
rowMeans(x, ...)
} else {
## matrix
ans <- mapply(function(i, j) {
X[i, , drop=FALSE][j, , drop=FALSE]
}, i=split(1:nrow(X), groups), j=byIdx, SIMPLIFY=FALSE, USE.NAMES=FALSE)
do.call(rbind, ans)
apply(x, 1L, fun, ...)
}
}

#' find top n indices of each group
#'
#' @param x matrix
#' @param groupBy factor/character of length \code{nrow(x)}
#' @param n consider just the top \code{n} values
#' @param fun function to summarise rows
#' @param ... further arguments passed to \code{fun}
#' @return double, indices sorted by summarising function \code{fun}
#' @noRd
.topIdx <- function(x, groupBy, n, fun, ...) {
if (n < 1) {
stop(sQuote("n"), " has to be greater or equal than 1.")
}
if (nrow(x) != length(groupBy)) {
stop(sQuote("nrow(x)"), " and ", sQuote("length(groupBy)"),
" have to be equal.")
}
rs <- .summariseRows(x, fun, ...)
o <- order(as.double(rs), decreasing=TRUE, na.last=TRUE)
idx <- unlist(lapply(split(o, groupBy[o]), "[", 1:n), use.names=FALSE)
idx[!is.na(idx)]
}

## Computes header from assay data by-passing cache
Expand Down
5 changes: 2 additions & 3 deletions man/MSnSet-class.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@
2 or more \code{MSnSet} instances according to their feature names.
Note that the \code{qual} slot and the processing information are
silently dropped. }
\item{topN}{signature(object = "MSnSet", groupBy, n = 3, fun, ...)}{
\item{topN}{signature(object = "MSnSet", groupBy, n = 3, fun, ..., verbose =
isMSnbaseVerbose())}{
Selects the \code{n} most intense features (typically peptides or
spectra) out of all available for each set defined by
\code{groupBy} (typically proteins) and creates a new instance of
Expand All @@ -239,8 +240,6 @@
decreasing order. Additional parameters can be passed to
\code{fun} through \code{...}, for instance to control the
behaviour of \code{topN} in case of \code{NA} values.
Note that the \code{qual} slot and the processing information are
silently dropped.
(Works also with \code{matrix} instances.)

See also the \code{\link{nQuants}} function to retrieve the
Expand Down
26 changes: 26 additions & 0 deletions tests/testthat/test_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -262,3 +262,29 @@ test_that("Get first MS level", {
expect_equivalent(y1, y2)
})

test_that(".summariseRows", {
m <- matrix(1:30, nrow=10)
m[seq(2, 30, by=3)] <- NA
expect_error(MSnbase:::.summariseRows(1:10, fun=sum))
expect_error(MSnbase:::.summariseRows(m, fun=1:10))
expect_equal(MSnbase:::.summariseRows(m, fun=sum), rep.int(NA_real_, 10))
expect_equal(MSnbase:::.summariseRows(m, fun=sum, na.rm=TRUE), rowSums(m, na.rm=TRUE))
expect_equal(MSnbase:::.summariseRows(m, fun=mean, na.rm=TRUE), rowMeans(m, na.rm=TRUE))
expect_equal(MSnbase:::.summariseRows(m, fun=max, na.rm=TRUE), c(21, 22, 13, 24, 25, 16, 27, 28, 19, 30))
})

test_that(".topIdx", {
m <- matrix(1:30, nrow=10)
g <- rep_len(LETTERS[1:3], 10)
expect_error(MSnbase:::.topIdx(m, groupBy=g, fun=sum)) # n missing
expect_error(MSnbase:::.topIdx(1:10, groupBy=g, fun=sum, n=3))
expect_error(MSnbase:::.topIdx(m, groupBy=g, fun=1:10, n=3))
expect_error(MSnbase:::.topIdx(m, groupBy=g, fun=sum, n=-1),
".*n.* has to be greater or equal than 1.")
expect_error(MSnbase:::.topIdx(m, groupBy=1:3, fun=sum, n=3),
".*nrow.*x.* and .*length.*groupBy.* have to be equal.")
expect_equal(MSnbase:::.topIdx(m, groupBy=g, fun=sum, n=3),
c(10, 7, 4, 8, 5, 2, 9, 6, 3))
expect_equal(MSnbase:::.topIdx(m, groupBy=g, fun=sum, n=2),
c(10, 7, 8, 5, 9, 6))
})