diff --git a/.travis.yml b/.travis.yml index 8dce1816..65867026 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,15 +14,15 @@ language: r cache: packages # R versions to be tested on -r: +r: - bioc-release - bioc-devel ## Turn this to true before submission to CRAN/Bioconductor warnings_are_errors: false -r_build_args: "--no-build-vignettes" -r_check_args: "--no-vignettes" +# r_build_args: "--no-build-vignettes" +# r_check_args: "--no-vignettes" notifications: email: diff --git a/DESCRIPTION b/DESCRIPTION index f2107bd8..e61597c6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: clusterExperiment Title: Compare Clusterings for Single-Cell Sequencing -Version: 1.3.0 +Version: 1.3.1 Description: Provides functionality for running and comparing many different clusterings of single-cell sequencing data or other large mRNA Expression data sets. Authors@R: c(person("Elizabeth", "Purdom", email = "epurdom@stat.berkeley.edu", @@ -13,9 +13,9 @@ BugReports: https://github.com/epurdom/clusterExperiment/issues License: Artistic-2.0 Depends: R (>= 3.3), - methods, SummarizedExperiment Imports: + methods, NMF, RColorBrewer, ape, @@ -29,7 +29,8 @@ Imports: matrixStats, graphics, parallel, - MAST + MAST, + RSpectra Suggests: BiocStyle, knitr, diff --git a/NAMESPACE b/NAMESPACE index dae52cab..45925c2b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ exportMethods("clusterTypes<-") exportMethods("coClustering<-") exportMethods("orderSamples<-") exportMethods("primaryClusterIndex<-") +exportMethods("transformation<-") exportMethods(RSEC) exportMethods(addClusters) exportMethods(clusterContrasts) @@ -39,6 +40,7 @@ exportMethods(clusterTypes) exportMethods(coClustering) exportMethods(combineMany) exportMethods(convertClusterLegend) +exportMethods(dendroClusterIndex) exportMethods(getBestFeatures) exportMethods(makeDendrogram) exportMethods(mergeClusters) @@ -73,6 +75,8 @@ importFrom(MAST,Hypothesis) importFrom(NMF,aheatmap) importFrom(RColorBrewer,brewer.pal) importFrom(RColorBrewer,brewer.pal.info) +importFrom(RSpectra,svds) +importFrom(ape,phydataplot) importFrom(ape,plot.phylo) importFrom(cluster,daisy) importFrom(cluster,pam) @@ -87,8 +91,13 @@ importFrom(matrixStats,rowVars) importFrom(parallel,mclapply) importFrom(phylobase,ancestors) importFrom(phylobase,descendants) +importFrom(phylobase,edgeLength) importFrom(phylobase,getNode) importFrom(phylobase,labels) +importFrom(phylobase,nNodes) +importFrom(phylobase,nodeLabels) +importFrom(phylobase,rootNode) +importFrom(phylobase,subset) importFrom(stats,dist) importFrom(stats,hclust) importFrom(stats,mad) diff --git a/NEWS b/NEWS index f585e371..baf83270 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,25 @@ +Changes in version 1.3.1 ( Release date: 2017-06-14 ) +============== +Changes: +* change how `plotHeatmap` handles visualizeData argument, so not required to have same number of genes as original, only same number of samples. +* Now if color of vectors given in `clusterLegend` does not have names, `plotHeatmap` will give them names matching the variable so that they will be used by `aheatmap` (previously would have left all colors white because do not have matching names). +* Large changes to how dendrograms are plotted by `plotDendrogram` and `mergeClusters`. This includes the ability to see the before and after clusterings along side the mergeClusters result, as well as a new slot added to the clusterExperiment class (`dendro_outbranch`). The names of several arguments to `mergeClusters` and `plotDendrogram` were changed for clarity: + - `leaves` is now `leafType` in `plotDendrogram`. + - `plotType` is now `plotInfo` in `mergeClusters` + - `doPlot` is now `plot` in `mergeClusters` + - `leafType` is now an option for `mergeClusters` as well. + - Now when `plotInfo` (previously `plotType`) is set to `none`, the plot is still drawn, but just no information about the merging is added to the plot. To not plot the dendrogram at all, set `plot=FALSE`. + - The option `labelType` in either `plotDendrogram` or `mergeClusters` controls whether names (`name`) or rectangular color blocks corresponding to the cluster (`colorblock`) are put at the tips of the dendrogram to label the clusters/samples. +* added `dendroClusterIndex` that behaves similarly to `primaryClusterIndex` +* added ability to give `dendro` as charater option to `whichClusters` argument +* added `transformation<-` to be able to assign manually the transformation slot +* Move MAST into 'suggests' pacakge so that not need R 3.4 to run the package. +* Change calculation of PCA dimensionality reduction to use `svds` from `RSpectra` package to improve speed + +Bugs: +* Fixed bug in RSEC where `combineProportion` argument was being ignored (set to 1) +* Fixed bug in definition of `transform` so that extends existing generic rather than masking it. + Changes in version 1.3.0 ( Release date: 2017-05-24 ) ============== Changes: @@ -5,10 +27,13 @@ Changes: * Added `plotBarplot` to plot a barplot for 1 cluster or comparison of 2 clusters along with tests. * Added `whichClusters` argument to `clusterMatrix` to return only clusters corresponding to certain clusters. Mainly relevant for using arguments like `workflow` that are used by other commands (otherwise could just index the complete matrix manually...) + Bug fixes: * `plotHeatmap` now goes through the `clusterLegend` input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the `clusterLegend` had more (or less) levels that it assigned color to than the `sampleData` did (e.g. if `sampleData` was a subset of larger dataset upon which the original colors were assigned.) NOTE: that this now has the effect of NOT plotting all values in the clusterLegend if they are not represented in the data, thus changing the previous behavior of `plotHeatmap` legend. * fixed bug in how `plotHeatmap` checked that the dimensions of user-supplied dendrogram match that of data (matrix version). * fixed `convertClusterLegend` so when `output` is `matrixNames` or `matrixColors`, the resulting matrix has the `colnames` equal to cluster labels, like `clusterMatrix`. +* internal function .convertToNum now preserves names of input vector. +* fixed bug in plotting with merge clusters; previously if plotType="all", might not have been correctly plotted with the right internal node of the dendrogram. Changes in version 1.2.0 ( Release date: 2017-04-04 ) ============== diff --git a/R/AllClasses.R b/R/AllClasses.R index 449c47fb..ff424f70 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -52,6 +52,8 @@ setClassUnion("matrixOrMissing",members=c("matrix", "missing")) #' details). #' @slot dendro_index numeric. An integer giving the cluster that was used to #' make the dendrograms. NA_real_ value if no dendrograms are saved. +#' @slot dendro_outbranch logical. Whether the dendro_samples dendrogram put +#' missing/non-clustered samples in an outbranch, or intermixed in the dendrogram. #' @slot coClustering matrix. A matrix with the cluster co-occurrence #' information; this can either be based on subsampling or on co-clustering #' across parameter sets (see \code{clusterMany}). The matrix is a square matrix @@ -85,6 +87,7 @@ setClass( dendro_samples = "dendrogramOrNULL", dendro_clusters = "dendrogramOrNULL", dendro_index = "numeric", + dendro_outbranch = "logical", coClustering = "matrixOrNULL", clusterLegend="list", orderSamples="numeric" @@ -129,20 +132,23 @@ setValidity("ClusterExperiment", function(object) { if(NCOL(object@clusterMatrix)!= length(object@clusterInfo)) { return("length of clusterInfo must be same as NCOL of the clusterMatrix") } - - ##Check dendrograms + ############ + ##Check dendrogram slotNames + ############ #browser() if(!is.null(object@dendro_samples)){ if(nobs(object@dendro_samples) != NCOL(object)) { return("dendro_samples must have the same number of leaves as the number of samples") } + if(is.na(object@dendro_outbranch)) return("if dendro_samples is defined, must also define dendro_outbranch") } else{ if(!is.null(object@dendro_clusters)) return("dendro_samples should not be null if dendro_clusters is non-null") + if(!is.na(object@dendro_outbranch)) return("dendro_samples should not be null if dendro_outbranch is not NA") } if(!is.null(object@dendro_clusters)){ - if(is.na(object@dendro_index)) return("if dendrogram slots are filled, must have corresponding dendro_index defined.") - dcluster<-clusterMatrix(object)[,object@dendro_index] + if(is.na(dendroClusterIndex(object))) return("if dendrogram slots are filled, must have corresponding dendro_index defined.") + dcluster<-clusterMatrix(object)[,dendroClusterIndex(object)] if(nobs(object@dendro_clusters) != max(dcluster)) { return("dendro_clusters must have the same number of leaves as the number of (non-negative) clusters") } @@ -150,11 +156,13 @@ setValidity("ClusterExperiment", function(object) { else{ if(!is.null(object@dendro_samples)) return("dendro_clusters should not be null if dendro_samples is non-null") } + ## Check co-clustering if(!is.null(object@coClustering) && (NROW(object@coClustering) != NCOL(object@coClustering) | NCOL(object@coClustering) != NCOL(object))) { return("`coClustering` must be a sample by sample matrix.") } + ## If have a cluster matrix if(!all(is.na(object@clusterMatrix))){ #what does this mean, how can they be all NA? #check primary index if(length(object@primaryIndex) != 1) { @@ -331,6 +339,7 @@ setMethod( #'@param dendro_samples dendrogram. Sets the `dendro_samples` slot (see Slots). #'@param dendro_clusters dendrogram. Sets the `dendro_clusters` slot (see #' Slots). +#' @param dendro_outbranch logical. Sets the `dendro_outbranch` slot (see Slots) #'@param dendro_index numeric. Sets the dendro_index slot (see Slots). #'@param coClustering matrix. Sets the `coClustering` slot (see Slots). #'@details The \code{clusterExperiment} constructor function gives clusterLabels @@ -348,6 +357,7 @@ setMethod( dendro_samples=NULL, dendro_index=NA_real_, dendro_clusters=NULL, + dendro_outbranch=NA, coClustering=NULL ){ if(NCOL(se) != nrow(clusters)) { @@ -396,6 +406,7 @@ setMethod( dendro_samples=dendro_samples, dendro_clusters=dendro_clusters, dendro_index=dendro_index, + dendro_outbranch=dendro_outbranch, coClustering=coClustering ) validObject(out) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index ff0c03e0..4c6535bf 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -192,13 +192,20 @@ setGeneric( standardGeneric("transformation") } ) - setGeneric( - name = "transform", - def = function(x,...) { - standardGeneric("transform") + name = "transformation<-", + def = function(object, value) { + standardGeneric("transformation<-") } ) +# don't need this because a standard generic already exists +# setGeneric( +# name = "transform", +# def = function(x,...) { +# standardGeneric("transform") +# } +# ) +setGeneric("transform") setGeneric( name = "clusterMatrix", @@ -228,6 +235,12 @@ setGeneric( } ) +setGeneric( + name = "dendroClusterIndex", + def = function(x) { + standardGeneric("dendroClusterIndex") + } +) setGeneric( name = "coClustering", def = function(x) { diff --git a/R/AllHelper.R b/R/AllHelper.R index 65f56e84..9306d8ba 100644 --- a/R/AllHelper.R +++ b/R/AllHelper.R @@ -91,7 +91,7 @@ setMethod( cat("Table of clusters (of primary clustering):") print(table(primaryClusterNamed(object))) cat("Total number of clusterings:", NCOL(clusterMatrix(object)),"\n") - if(!is.na(object@dendro_index) ) cat("Dendrogram run on '",clusterLabels(object)[object@dendro_index],"' (cluster index: ", object@dendro_index,")\n",sep="") else cat("No dendrogram present\n") + if(!is.na(dendroClusterIndex(object)) ) cat("Dendrogram run on '",clusterLabels(object)[dendroClusterIndex(object)],"' (cluster index: ", dendroClusterIndex(object),")\n",sep="") else cat("No dendrogram present\n") cat("-----------\n") cat("Workflow progress:\n") typeTab<-names(table(clusterTypes(object))) @@ -148,6 +148,19 @@ setMethod( } ) +#' @rdname ClusterExperiment-methods +#' @export +#' @aliases transformation<- +setReplaceMethod( + f = "transformation", + signature = signature("ClusterExperiment", "function"), + definition = function(object, value) { + object@transformation <- value + validObject(object) + return(object) + } +) + #' @rdname ClusterExperiment-methods #' @return \code{nClusters} returns the number of clusterings (i.e., ncol of #' clusterMatrix). @@ -252,6 +265,20 @@ setMethod( } ) +#' @rdname ClusterExperiment-methods +#' @return \code{dendroClusterIndex} returns/sets the clustering index +#' of the clusters used to create dendrogram +#' (i.e., which column of clusterMatrix corresponds to the clustering). +#' @export +#' @aliases dendroClusterIndex +setMethod( + f = "dendroClusterIndex", + signature = "ClusterExperiment", + definition = function(x) { + return(x@dendro_index) + } +) + #' @rdname ClusterExperiment-methods #' @export #' @aliases primaryClusterIndex<- diff --git a/R/addClusters.R b/R/addClusters.R index 4a402f4c..dd9efa8a 100644 --- a/R/addClusters.R +++ b/R/addClusters.R @@ -118,15 +118,17 @@ setMethod( newClusterColors<-clusterLegend(x)[-whichRemove] dend_samples <- x@dendro_samples dend_cl <- x@dendro_clusters - dend_ind<-x@dendro_index + dend_ind<-dendroClusterIndex(x) + dend_out<-x@dendro_outbranch coMat<-x@coClustering orderSamples<-orderSamples(x) if(primaryClusterIndex(x) %in% whichRemove) pIndex<-1 else pIndex<-match(primaryClusterIndex(x),(1:NCOL(clusterMatrix(x)))[-whichRemove]) - if(x@dendro_index %in% whichRemove){ + if(dendroClusterIndex(x) %in% whichRemove){ dend_cl<-NULL dend_samples<-NULL dend_ind<-NA_real_ + dend_out<-NA } else{ dend_ind<-match(dend_ind,(1:NCOL(clusterMatrix(x)))[-whichRemove]) @@ -139,6 +141,7 @@ setMethod( dendro_samples=dend_samples, dendro_clusters=dend_cl, dendro_index=dend_ind, + dendro_outbranch=dend_out, coClustering=coMat, orderSamples=orderSamples ) diff --git a/R/clusterMany.R b/R/clusterMany.R index acc34026..5709d64c 100644 --- a/R/clusterMany.R +++ b/R/clusterMany.R @@ -11,15 +11,15 @@ #' \code{SummarizedExperiment} object, or a \code{ClusterExperiment} object. #' @param ks the range of k values (see details for meaning for different #' choices). -#' @param alphas values of alpha to be tried. Only used for clusterFunctions of -#' type '01' (either 'tight' or 'hierarchical01'). Determines tightness +#' @param alphas values of alpha to be tried. Only used for clusterFunctions of +#' type '01' (either 'tight' or 'hierarchical01'). Determines tightness #' required in creating clusters from the dissimilarity matrix. Takes on #' values in [0,1]. See \code{\link{clusterD}}. #' @param betas values of \code{beta} to be tried in sequential steps. Only used #' for \code{sequential=TRUE}. Determines the similarity between two clusters #' required in order to deem the cluster stable. Takes on values in [0,1]. See #' \code{\link{seqCluster}}. -#' @param clusterFunction function used for the clustering. Note that unlike in +#' @param clusterFunction function used for the clustering. Note that unlike in #' \code{\link{clusterSingle}}, this must be a character vector of pre-defined #' clustering techniques provided by \code{\link{clusterSingle}}, and can not #' be a user-defined function. Current functions are "tight", @@ -27,15 +27,15 @@ #' @param minSizes the minimimum size required for a cluster (in #' \code{clusterD}). Clusters smaller than this are not kept and samples are #' left unassigned. -#' @param distFunction a vector of character strings that are the names of +#' @param distFunction a vector of character strings that are the names of #' distance functions found in the global environment. See the help pages of -#' \code{\link{clusterD}} for details about the required format of distance -#' functions. Currently, this distance function must be applicable for all +#' \code{\link{clusterD}} for details about the required format of distance +#' functions. Currently, this distance function must be applicable for all #' clusterFunction types tried. Therefore, it is not possible to intermix type "K" #' and type "01" algorithms if you also give distances to evaluate via #' \code{distFunction} unless all distances give 0-1 values for the distance #' (and hence are possible for both type "01" and "K" algorithms). -#' @param nVarDims vector of the number of the most variable features to keep +#' @param nVarDims vector of the number of the most variable features to keep #' (when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is #' included, then the full dataset will also be included. #' @param nPCADims vector of the number of PCs to use (when 'PCA' is identified @@ -50,7 +50,7 @@ #' @inheritParams clusterSingle #' @inheritParams clusterD #' @param ncores the number of threads -#' @param random.seed a value to set seed before each run of clusterSingle (so +#' @param random.seed a value to set seed before each run of clusterSingle (so #' that all of the runs are run on the same subsample of the data). Note, if #' 'random.seed' is set, argument 'ncores' should NOT be passed via #' subsampleArgs; instead set the argument 'ncores' of @@ -170,7 +170,7 @@ # clSmaller <- clusterMany(simData, nPCADims=c(5,10,50), dimReduce="PCA", # paramMatrix=checkParamsMat, subsampleArgs=checkParams$subsampleArgs, # seqArgs=checkParams$seqArgs, clusterDArgs=checkParams$clusterDArgs) - +#' @export setMethod( f = "clusterMany", signature = signature(x = "matrix"), @@ -179,6 +179,7 @@ setMethod( transFun=NULL,isCount=FALSE, ... ){ + if(any(dim(x)==0)) stop("x must have non zero dimensions") origX <- x transObj <- .transData(x, nPCADims=nPCADims, nVarDims=nVarDims, @@ -249,7 +250,7 @@ setMethod( dataList<-data dataName <- names(dataList) if(is.null(paramMatrix)){ - param <- expand.grid(dataset=dataName, + param <- expand.grid(dataset=dataName, k=ks, alpha=alphas, findBestK=findBestK, beta=betas, minSize=minSizes, sequential=sequential, distFunction=distFunction, removeSil=removeSil, subsample=subsample, @@ -264,7 +265,7 @@ setMethod( if(length(typeK)>0){ param[typeK,"alpha"] <- NA #just a nothing value, because doesn't mean anything here #param[typeK,"beta"] <- NA #just a nothing value, because doesn't mean anything here - + #if findBestK make sure other arguments make sense: whFindBestK <- which(param[,"findBestK"]) if(length(whFindBestK)>0){ @@ -308,7 +309,7 @@ setMethod( warning("beta value must be in (0,1). Input betas outside that range are ignored") param[beta01,"beta"]<-NA } - + param <- unique(param) ##### @@ -331,7 +332,7 @@ setMethod( param<-param[-whInvalid,] } - #if type K and not findBestK, need to give the k value. + #if type K and not findBestK, need to give the k value. whInvalid <- which(is.na(param[,"k"]) & !param[,"findBestK"] & param[,"clusterFunction"] %in% c("pam","hierarchicalK") ) if(length(whInvalid)>0){ param<-param[-whInvalid,] @@ -385,7 +386,7 @@ setMethod( subsample <- as.logical(gsub(" ","",par["subsample"])) findBestK <- as.logical(gsub(" ","",par["findBestK"])) clusterFunction <- as.character(par[["clusterFunction"]]) - distFunction<-if(!is.na(par[["distFunction"]])) as.character(par[["distFunction"]]) else distFunction<-NULL + distFunction<-if(!is.na(par[["distFunction"]])) as.character(par[["distFunction"]]) else NULL if(!is.na(par[["k"]])){ if(sequential) { seqArgs[["k0"]] <- par[["k"]] @@ -434,12 +435,13 @@ setMethod( return(distMat) }) names(allDist)<-paste(distParam[,"dataset"],distParam[,"distFunction"],sep="--") - + } - + if(verbose) { cat("Running Clustering on Parameter Combinations...") } + #browser() if(ncores>1) { out <- mclapply(1:nrow(param), FUN=paramFun, mc.cores=ncores, ...) nErrors <- which(sapply(out, function(x){inherits(x, "try-error")})) @@ -490,8 +492,8 @@ setMethod( #outval<-.addBackSEInfo(newObj=outval,oldObj=x) #added to '.addNewResult' ##Check if clusterMany already ran previously x<-.updateCurrentWorkflow(x,eraseOld,"clusterMany") - - if(!is.null(x)) retval<-.addNewResult(newObj=outval,oldObj=x) #make decisions about what to keep. + + if(!is.null(x)) retval<-.addNewResult(newObj=outval,oldObj=x) #make decisions about what to keep. else retval<-.addBackSEInfo(newObj=outval,oldObj=x) validObject(retval) return(retval) @@ -523,6 +525,13 @@ setMethod( } } ) +#' @export +#' @rdname clusterMany +setMethod( +f = "clusterMany", +signature = signature(x = "data.frame"), +definition = function(x,...){clusterMany(data.matrix(x),...)} +) diff --git a/R/getFeatures.R b/R/getFeatures.R index d17eb3be..d71adac1 100644 --- a/R/getFeatures.R +++ b/R/getFeatures.R @@ -238,7 +238,7 @@ setMethod(f = "getBestFeatures", if(is.null(x@dendro_clusters)) { stop("If `contrastType='Dendro'`, `makeDendrogram` must be run before `getBestFeatures`") } else { - if(primaryClusterIndex(x)!= x@dendro_index) stop("Primary cluster does not match the cluster on which the dendrogram was made. Either replace existing dendrogram with on using the primary cluster (via 'makeDendrogram'), or reset primaryCluster with 'primaryClusterIndex' to be equal to index of 'dendo_index' slot") + if(primaryClusterIndex(x)!= dendroClusterIndex(x)) stop("Primary cluster does not match the cluster on which the dendrogram was made. Either replace existing dendrogram with on using the primary cluster (via 'makeDendrogram'), or reset primaryCluster with 'primaryClusterIndex' to be equal to index of 'dendo_index' slot") else dendro <- x@dendro_clusters } } diff --git a/R/internalFunctions.R b/R/internalFunctions.R index 19d6d037..107ea3a6 100644 --- a/R/internalFunctions.R +++ b/R/internalFunctions.R @@ -29,6 +29,7 @@ if(is.na(retval@dendro_index) & !is.na(oldObj@dendro_index)){ retval@dendro_samples<-oldObj@dendro_samples retval@dendro_clusters<-oldObj@dendro_clusters + retval@dendro_outbranch<-oldObj@dendro_outbranch retval@dendro_index<-oldObj@dendro_index+nClusters(newObj) #update index to where dendrogram from } #put back orderSamples, coClustering @@ -48,8 +49,11 @@ orderSamples=orderSamples(newObj), coClustering=coClustering(newObj), dendro_samples=newObj@dendro_samples, + dendro_outbranch=newObj@dendro_outbranch, dendro_clusters=newObj@dendro_clusters, - dendro_index=newObj@dendro_index) + dendro_index=newObj@dendro_index, + primaryIndex=primaryClusterIndex(newObj) + ) clusterLegend(retval)<-clusterLegend(newObj) return(retval) } @@ -110,6 +114,7 @@ } .convertToNum<-function(x){ + nms<-names(x) if(is.factor(x)){ x<-as.character(x) } @@ -120,9 +125,10 @@ if(inherits(test,"try-error")) x<-as.numeric(factor(x)) else x<-test options(op) - return(x) + } - else return(x) + names(x)<-nms + return(x) } ##Universal way to convert matrix of clusters (of any value) into integers, preserving -1, -2 values .makeIntegerClusters<-function(clMat){ @@ -194,9 +200,9 @@ return(list(colorList=colorList,convertedToColor=colorMat,numClusters=clMat)) } -##Universal way to change character indication of clusterTypes into indices. +##Universal way to change character indication of clusterTypes into integer indices. .TypeIntoIndices<-function(x,whClusters){ - test<-try(match.arg(whClusters[1],c("workflow","all","none","primaryCluster")),silent=TRUE) + test<-try(match.arg(whClusters[1],c("workflow","all","none","primaryCluster","dendro")),silent=TRUE) if(!inherits(test,"try-error")){ if(test=="workflow"){ ppIndex<-workflowClusterDetails(x) @@ -214,6 +220,10 @@ } if(test=="none") wh<-vector("integer",length=0) if(test=="primaryCluster") wh<-primaryClusterIndex(x) + if(test=="dendro"){ + wh<-dendroClusterIndex(x) + if(is.na(wh)) wh<-vector("integer",length=0) + } } else{ #first match to clusterTypes @@ -230,17 +240,6 @@ #browser() if(all(is.na(totalMatch))) wh<-vector("integer",length=0) else wh<-na.omit(totalMatch) #silently ignore things that don't match. -# -# if(!any(whClusters %in% clusterTypes(x))){ -# if(!any(whClusters %in% clusterLabels(x))) wh<-vector("integer",length=0) -# else{ -# wh<-which(clusterLabels(x) %in% whClusters) -# } -# } -# else{ -# #if(!all(whClusters %in% clusterTypes(x))) warning("not all indicated clusters match a clusterTypes") -# wh<-which(clusterTypes(x) %in% whClusters) -# } } return(wh) } @@ -272,9 +271,10 @@ return(clust.id) } + #### #Convert to object used by phylobase so can navigate easily -.makePhylobaseTree<-function(x,type){ +.makePhylobaseTree<-function(x,type,isSamples=FALSE,outbranch=FALSE){ type<-match.arg(type,c("hclust","dendro")) if(type=="hclust"){ #first into phylo from ape package @@ -287,8 +287,54 @@ } phylo4Obj<-try(as(tempPhylo,"phylo4"),FALSE) if(inherits(phylo4Obj, "try-error")) stop("the internally created phylo object cannot be converted to a phylo4 class. Check that you gave simple hierarchy of clusters, and not one with fake data per sample") - phylobase::nodeLabels(phylo4Obj)<-paste("Node",1:phylobase::nNodes(phylo4Obj),sep="") - return(phylo4Obj) + #browser() + if(isSamples){ + #NOTE: clusterNodes are found by those with non-zero edge-length between them and their decendents + nonZeroEdges<-phylobase::edgeLength(phylo4Obj)[which(phylobase::edgeLength(phylo4Obj)>0)] #doesn't include root + trueInternal<-sort(unique(as.numeric(sapply(strsplit(names(nonZeroEdges),"-"),.subset2,1)))) #this also picks up the outbranch between -1,-2 + #old way of doing it: + #clusterNodes<-sort(unique(unlist(phylobase::ancestors(phylo4Obj,node=phylobase::getNode(phylo4Obj,type="tip"),type="parent"),recursive=FALSE,use.names=FALSE))) + if(outbranch){#remove root from labeling if -1 outbranch + ####### + #remove root + ####### + rootNode<-phylobase::rootNode(phylo4Obj) + trueInternal<-trueInternal[!trueInternal%in%rootNode] + + ####### + #find the -1/-2 internal node (if it exists) + #determine it as the one without 0-length tip edges. + ####### + rootChild<-phylobase::descendants(phylo4Obj,node=rootNode,type="children") + #find tip descendants of these: + rootChildDesc<-lapply(rootChild,phylobase::descendants,phy=phylo4Obj,type="tip") + rootChildLeng<-lapply(rootChildDesc,phylobase::edgeLength,x=phylo4Obj) + rootChildNum<-sapply(rootChildLeng,min) + outbranchNode<-rootChild[rootChildNum>0] + + if(outbranchNode %in% trueInternal){ + outbranchIsInternal<-TRUE + outbranchNodeDesc<-phylobase::descendants(phylo4Obj,node=outbranchNode,type="ALL") #includes itself + trueInternal<-trueInternal[!trueInternal%in%outbranchNodeDesc] + outbranchNodeDesc<-outbranchNodeDesc[outbranchNodeDesc %in% phylobase::getNode(phylo4Obj,type="internal")] + } + else outbranchIsInternal<-FALSE + + } + #trueInternal<-allInternal[!allInternal%in%clusterNodes] + + phylobase::nodeLabels(phylo4Obj)[as.character(trueInternal)]<-paste("Node",1:length(trueInternal),sep="") + #add new label for root + if(outbranch){ + phylobase::nodeLabels(phylo4Obj)[as.character(rootNode)]<-"Root" + if(outbranchIsInternal) phylobase::nodeLabels(phylo4Obj)[as.character(outbranchNodeDesc)]<-paste("MissingNode",1:length(outbranchNodeDesc),sep="") + } + } + else phylobase::nodeLabels(phylo4Obj)<-paste("Node",1:phylobase::nNodes(phylo4Obj),sep="") + + return(phylo4Obj) } +# clTree<-.makePhylobaseTree(clustWithDendro@dendro_clusters,"dendro") +# sampTree<-.makePhylobaseTree(clustWithDendro@dendro_samples,"dendro",isSamples=TRUE,outbranch=FALSE) diff --git a/R/makeDendrogram.R b/R/makeDendrogram.R index 57a5e81f..38ded715 100644 --- a/R/makeDendrogram.R +++ b/R/makeDendrogram.R @@ -1,23 +1,23 @@ #' @title Make hierarchy of set of clusters -#' -#' @description Makes a dendrogram of a set of clusters based on hclust on the -#' medoids of the cluster. -#' -#' @aliases makeDendrogram -#' -#' @param x data to define the medoids from. Matrix and -#' \code{\link{ClusterExperiment}} supported. -#' @param cluster A numeric vector with cluster assignments. If x is a -#' ClusterExperiment object, cluster is automatically the primaryCluster(x). -#' ``-1'' indicates the sample was not assigned to a cluster. -#' @param unassignedSamples how to handle unassigned samples("-1") ; only relevant -#' for sample clustering. See details. -#' @param whichCluster an integer index or character string that identifies -#' which cluster should be used to make the dendrogram. Default is +#' +#' @aliases makeDendrogram,ClusterExperiment-method + +#' @description Makes a dendrogram of a set of clusters based on hclust on the +#' medoids of the cluster. +#' @param x data to define the medoids from. Matrix and +#' \code{\link{ClusterExperiment}} supported. +#' @param cluster A numeric vector with cluster assignments. If x is a +#' ClusterExperiment object, cluster is automatically the primaryCluster(x). +#' ``-1'' indicates the sample was not assigned to a cluster. +#' @param unassignedSamples how to handle unassigned samples("-1") ; only +#' relevant for sample clustering. See details. +#' @param whichCluster an integer index or character string that identifies +#' which cluster should be used to make the dendrogram. Default is #' primaryCluster. -#' @param ... for makeDendrogram, if signature \code{matrix}, arguments passed -#' to hclust; if signature \code{ClusterExperiment} passed to the method for -#' signature \code{matrix}. For plotDendrogram, passed to \code{\link{plot.dendrogram}}. +#' @param ... for makeDendrogram, if signature \code{matrix}, arguments passed +#' to hclust; if signature \code{ClusterExperiment} passed to the method for +#' signature \code{matrix}. For plotDendrogram, passed to +#' \code{\link{plot.dendrogram}}. #' @inheritParams clusterSingle #' @inheritParams transform #' @details The function returns two dendrograms (as a list if x is a matrix or @@ -55,8 +55,9 @@ #' #create dendrogram of clusters: #' hcl <- makeDendrogram(cl) #' plotDendrogram(hcl) -#' plotDendrogram(hcl, leaves="samples") +#' plotDendrogram(hcl, leafType="samples",labelType="colorblock") #' +#' @name makeDendrogram #' @rdname makeDendrogram setMethod( f = "makeDendrogram", @@ -70,6 +71,7 @@ setMethod( if(!whCl %in% 1:nClusters(x)) stop("Invalid value for 'whichCluster'. Must be integer between 1 and ", nClusters(x)) # browser() cl<-clusterMatrix(x)[,whCl] + #cl<-convertClusterLegend(x,output="matrixNames")[,whCl] ######## ##Transform the data ######## @@ -83,17 +85,19 @@ setMethod( origX <- assay(x) nPCADims <- ifelse(dimReduce=="PCA", ndims, NA) nVarDims <- ifelse(dimReduce=="var", ndims, NA) - dimReduceCl<-if(ignoreUnassignedVar) cl else NULL #if else doesn't work with NULL + dimReduceCl<-if(ignoreUnassignedVar) cl else NULL #ifelse doesn't work with NULL transObj <- .transData(origX, nPCADims=nPCADims, nVarDims=nVarDims, dimReduce=dimReduce, transFun=transformation(x),clustering=dimReduceCl) dat <- transObj$x - if(is.null(dim(dat)) || NCOL(dat) != NCOL(origX)) { + if(is.null(dim(dat)) || NCOL(dat) != NCOL(origX)) { stop("Error in the internal transformation of x") } outlist <- makeDendrogram(x=dat, cluster=cl,unassignedSamples=unassignedSamples, ...) x@dendro_samples <- outlist$samples x@dendro_clusters <- outlist$clusters x@dendro_index<-whCl + #browser() + x@dendro_outbranch<- any(cl<0) & unassignedSamples=="outgroup" validObject(x) return(x) }) @@ -115,22 +119,25 @@ setMethod( if(is.null(colnames(x))) { colnames(x) <- as.character(1:ncol(x)) } - if(is.factor(cl)) { - warning("cluster is a factor. Converting to numeric, which may not result in valid conversion") - cl<-as.numeric(as.character(cl)) - } + clNum<-.convertToNum(cl) + + # if(is.factor(cl)) { + # warning("cluster is a factor. Converting to numeric, which may not result in valid conversion") + # cl<-as.numeric(as.character(cl)) + # } #dat <- t(x) #make like was in old code ############# # Cluster dendrogram ############# - whRm <- which(cl >= 0) #remove -1, -2 + whRm <- which(clNum >= 0) #remove -1, -2 if(length(whRm) == 0) stop("all samples have clusterIds<0") if(length(unique(cl[whRm]))==1) stop("Only 1 cluster given. Can not make a dendrogram.") - clFactor <- factor(cl[whRm]) + clFactor <- factor(cl[whRm]) medoids <- do.call("rbind", by(t(x[,whRm]), clFactor, function(z){apply(z, 2, median)})) rownames(medoids) <- levels(clFactor) nPerCluster <- table(clFactor) + #browser() clusterD<-as.dendrogram(stats::hclust(dist(medoids)^2,members=nPerCluster,...)) ############# @@ -247,10 +254,13 @@ setMethod( #add remaining to fake data and let them cluster fakeData <- rbind(fakeData,dat[-whRm,,drop=FALSE]) fakeData <- fakeData[rownames(dat),,drop=FALSE] - return(as.dendrogram(stats::hclust(dist(fakeData)))) + #return(as.dendrogram(stats::hclust(dist(fakeData)))) } } - fullD <- as.dendrogram(stats::hclust(dist(fakeData)^2), ...) +# browser() + #make sure fakeData in same order as original data so order.dendrogram will work + fakeData<-fakeData[na.omit(match(rownames(dat),rownames(fakeData))),] + fullD <- as.dendrogram(stats::hclust(dist(fakeData)^2), ...) if(length(whRm) != nrow(dat) && unassigned == "outgroup"){ #need to get rid of super long outgroup arm armLength <- max(attributes(fullD[[1]])$height, @@ -264,67 +274,4 @@ setMethod( -#' @rdname makeDendrogram -#' @export -#' @param leaves if "samples" the dendrogram has one leaf per sample, otherwise -#' it has one per cluster. -#' @param main passed to the \code{plot} function. -#' @param sub passed to the \code{plot} function. -#' @param clusterNames logical. If \code{leaves="clusters"}, then clusters will -#' be identified with their 'name' value in legend; otherwise the 'clusterIds' -#' value will be used. -#' @aliases plotDendrogram -#' @details If \code{leaves="clusters"}, the plotting function will work best if -#' the clusters in the dendrogram correspond to the primary cluster. This is -#' because the function colors the cluster labels based on the colors of the -#' clusterIds of the primaryCluster -#' @importFrom ape plot.phylo -setMethod( - f = "plotDendrogram", - signature = "ClusterExperiment", - definition = function(x,leaves=c("clusters","samples" ), clusterNames=TRUE, - main,sub,...) - { - leaves<-match.arg(leaves) - if(missing(main)) main<-ifelse(leaves=="samples","Dendrogram of samples", "Dendrogram of clusters") - if(is.null(x@dendro_samples) || is.null(x@dendro_clusters)) stop("No dendrogram is found for this ClusterExperiment Object. Run makeDendrogram first.") - if(missing(sub)) sub<-paste("Dendrogram made with '",clusterLabels(x)[x@dendro_index],"', cluster index ",x@dendro_index,sep="") - dend<- switch(leaves,"samples"=x@dendro_samples,"clusters"=x@dendro_clusters) - phylo4Obj <- .makePhylobaseTree(dend, "dendro") - phyloObj <- as(phylo4Obj, "phylo") - leg<-clusterLegend(x)[[x@dendro_index]] - if(leaves=="clusters"){ - m<-match(phyloObj$tip.label,leg[,"clusterIds"]) - if(any(is.na(m))) stop("clusterIds do not match dendrogram labels") - phyloObj$tip.label<-leg[m,"name"] - tip.color<-leg[m,"color"] - - } - else{ - cl<-clusterMatrix(x)[,x@dendro_index] - m<-match(cl,leg[,"clusterIds"]) - tip.color<-leg[m,"color"] - } - #browser() - if(max(phyloObj$edge.length)>1e6) phyloObj$edge.length<-phyloObj$edge.length/max(phyloObj$edge.length) #otherwise get error - ape::plot.phylo(phyloObj, tip.color=tip.color,...) - invisible(phyloObj) - # labs<-labels(dend) - # m<-match(labs,leg[,"clusterIds"]) -# if(any(is.na(m))) warning("Dendrogram labels do not all match clusterIds of primaryCluster. Dendrogram was not created with current primary cluster, so cannot retreive cluster name or color") -# else{ -# #function to change to labels and colors of a node: -# reLabel <- function(n) { -# if(is.leaf(n)) { -# a <- attributes(n) -# m<-match(a$label,leg[,"clusterIds"]) -# if(clusterNames) attr(n, "label") <- leg[m,"name"] # change the node label -# attr(n, "nodePar") <- c(a$nodePar, list(lab.col = leg[m,"color"],col=leg[m,"color"],pch=19)) # change the node color -# } -# return(n) -# } -# dend <- dendrapply(dend, reLabel) -# } -# } -# plot(dend,main=main,sub=sub,...) - }) + diff --git a/R/mergeClusters.R b/R/mergeClusters.R index 1935a616..48df445a 100644 --- a/R/mergeClusters.R +++ b/R/mergeClusters.R @@ -1,95 +1,118 @@ +.availMergeMethods<-c("adjP", "locfdr", "MB", "JC") #' @title Merge clusters based on dendrogram -#' -#' @description Takes an input of hierarchical clusterings of clusters and -#' returns estimates of number of proportion of non-null and merges those +#' +#' @description Takes an input of hierarchical clusterings of clusters and +#' returns estimates of number of proportion of non-null and merges those #' below a certain cutoff. -#' +#' #' @aliases mergeClusters -#' -#' @param x data to perform the test on. It can be a matrix or a +#' +#' @param x data to perform the test on. It can be a matrix or a #' \code{\link{ClusterExperiment}}. -#' @param cl A numeric vector with cluster assignments to compare to. ``-1'' +#' @param cl A numeric vector with cluster assignments to compare to. ``-1'' #' indicates the sample was not assigned to a cluster. -#' @param dendro dendrogram providing hierarchical clustering of clusters in cl; -#' The default for matrix (NULL) is to recalculate it with the given (x, cl) -#' pair. If x is a \code{\link{ClusterExperiment}} object, the dendrogram in -#' the slot \code{dendro_clusters} will be used. This means that -#' \code{\link{makeDendrogram}} needs to be called before -#' \code{mergeClusters}. +#' @param dendro dendrogram providing hierarchical clustering of clusters in cl. +#' If x is a matrix, then the default is \code{dendro=NULL} and the function +#' will calculate the dendrogram with the given (x, cl) pair using +#' \code{\link{makeDendrogram}}. If x is a \code{\link{ClusterExperiment}} +#' object, the dendrogram in the slot \code{dendro_clusters} will be used. In +#' this case, this means that \code{\link{makeDendrogram}} needs to be called +#' before \code{mergeClusters}. #' @param mergeMethod method for calculating proportion of non-null that will be -#' used to merge clusters (if 'none', no merging will be done). See details +#' used to merge clusters (if 'none', no merging will be done). See details #' for description of methods. -#' @param cutoff minimimum value required for NOT merging a cluster, i.e. -#' two clusters with the proportion of DE below cutoff will be merged. -#' Must be a value between 0, 1, where -#' lower values will make it harder to merge clusters. -#' @param plotType what type of plotting of dendrogram. If 'all', then all the -#' estimates of proportion non-null will be plotted; if 'mergeMethod', then -#' only the value used in the merging is plotted for each node. -#' @param isCount logical as to whether input data is a count matrix. See details. -#' @param doPlot logical as to whether to plot the dendrogram (overrides -#' \code{plotType} value). Mainly used for internal coding purposes. -#' @param ... for signature \code{matrix}, arguments passed to the -#' \code{\link{plot.phylo}} function of \code{ade4} that plots the dendrogram. -#' For signature \code{ClusterExperiment} arguments passed to the method for -#' signature \code{matrix}. +#' @param cutoff minimimum value required for NOT merging a cluster, i.e. two +#' clusters with the proportion of DE below cutoff will be merged. Must be a +#' value between 0, 1, where lower values will make it harder to merge +#' clusters. +#' @param plotInfo what type of information about the merging will be shown on +#' the dendrogram. If 'all', then all the estimates of proportion non-null +#' will be plotted at each node of the dendrogram; if 'mergeMethod', then only +#' the value used in the merging is plotted at each node. If 'none', then no +#' proportions will be added to the dendrogram. 'plotInfo' can also be one of +#' the mergeMethod choices (even if that method is not the method chosen in +#' 'mergeMethod' options). +#' @param isCount logical as to whether input data is a count matrix. See +#' details. +#' @param plot logical as to whether to plot the dendrogram with the merge +#' results +#' @param ... for signature \code{matrix}, arguments passed to the +#' \code{\link{plot.phylo}} function of \code{ape} that plots the dendrogram. +#' For signature \code{ClusterExperiment} arguments passed to the method for +#' signature \code{matrix} and then onto \code{\link{plot.phylo}}. #' @inheritParams clusterMany,matrix-method -#' -#' @details If \code{isCount=TRUE}, and the input is a matrix, -#' \code{log2(count + 1)} will be used for \code{\link{makeDendrogram}} and the -#' original data with voom correction will be used in -#' \code{\link{getBestFeatures}}). If input is -#' \code{\link{ClusterExperiment}}, then setting \code{isCount=TRUE} also means -#' that the log2(1+count) will be used as the transformation, like for -#' the matrix case as well as the voom calculation, and will NOT use the -#' transformation stored in the object. If FALSE, then transform(x) will be -#' given to the input and will be used for both \code{makeDendrogram} and +#' +#' @details If \code{isCount=TRUE}, and the input is a matrix, \code{log2(count +#' + 1)} will be used for \code{\link{makeDendrogram}} and the original data +#' with voom correction will be used in \code{\link{getBestFeatures}}). If +#' input is \code{\link{ClusterExperiment}}, then setting \code{isCount=TRUE} +#' also means that the log2(1+count) will be used as the transformation, like +#' for the matrix case as well as the voom calculation, and will NOT use the +#' transformation stored in the object. If FALSE, then transform(x) will be +#' given to the input and will be used for both \code{makeDendrogram} and #' \code{getBestFeatures}, with no voom correction. -#' @details "JC" refers to the method of Ji and Cai (2007), and implementation -#' of "JC" method is copied from code available on Jiashin Ji's website, -#' December 16, 2015 +#' @details "JC" refers to the method of Ji and Cai (2007), and implementation +#' of "JC" method is copied from code available on Jiashin Ji's website, +#' December 16, 2015 #' (http://www.stat.cmu.edu/~jiashun/Research/software/NullandProp/). "locfdr" -#' refers to the method of Efron (2004) and is implemented in the package +#' refers to the method of Efron (2004) and is implemented in the package #' \code{\link{locfdr}}. "MB" refers to the method of Meinshausen and Buhlmann -#' (2005) and is implemented in the package \code{\link{howmany}}. "adjP" +#' (2005) and is implemented in the package \code{\link{howmany}}. "adjP" #' refers to the proportion of genes that are found significant based on a FDR #' adjusted p-values (method "BH") and a cutoff of 0.05. -#' -#' @details If \code{mergeMethod} is not equal to 'none' then the plotting will -#' indicate where the clusters will be merged (assuming \code{plotType} is not 'none'). -#' @return If `x` is a matrix, it returns (invisibly) a list with elements -#' \itemize{ \item{\code{clustering}}{ a vector of length equal to ncol(x) -#' giving the integer-valued cluster ids for each sample. "-1" indicates the -#' sample was not clustered.} \item{\code{oldClToNew}}{ A table of the old +#' +#' @details If \code{mergeMethod} is not equal to 'none' then the plotting will +#' indicate where the clusters will be merged (assuming \code{plotInfo} is not +#' 'none'). Note setting both 'mergeMethod' and 'plotInfo' to 'none' will +#' cause function to stop, because nothing is asked to be done. If you just +#' want plot of the dendrogram, with no merging performed or demonstrated on +#' the plot, see \code{\link{plotDendrogram}}. +#' @details If the dendrogram was made with option +#' \code{unassignedSamples="cluster"} (i.e. unassigned were clustered in with +#' other samples), then you cannot choose the option +#' \code{leafType='samples'}. This is because the current code cannot reliably +#' link up the internal nodes of the sample dendrogram to the internal nodes +#' of the cluster dendrogram when the unassigned samples are intermixed. +#' @return If `x` is a matrix, it returns (invisibly) a list with elements +#' \itemize{ \item{\code{clustering}}{ a vector of length equal to ncol(x) +#' giving the integer-valued cluster ids for each sample. "-1" indicates the +#' sample was not clustered.} \item{\code{oldClToNew}}{ A table of the old #' cluster labels to the new cluster labels.} \item{\code{propDE}}{ A table of -#' the proportions that are DE on each node.} -#' \item{\code{originalClusterDendro}}{ The dendrogram on which the merging +#' the proportions that are DE on each node.} +#' \item{\code{originalClusterDendro}}{ The dendrogram on which the merging #' was based (based on the original clustering).} } -#' @return If `x` is a \code{\link{ClusterExperiment}}, it returns a new -#' \code{ClusterExperiment} object with an additional clustering based on the +#' @return If `x` is a \code{\link{ClusterExperiment}}, it returns a new +#' \code{ClusterExperiment} object with an additional clustering based on the #' merging. This becomes the new primary clustering. +#' @seealso makeDendrogram, plotDendrogram, getBestFeatures #' @examples #' data(simData) -#' +#' #' #create a clustering, for 8 clusters (truth was 3) #' cl<-clusterSingle(simData, clusterFunction="pam", subsample=FALSE, #' sequential=FALSE, clusterDArgs=list(k=8)) -#' +#' +#' #give more interesting names to clusters: +#' newNames<- paste("Cluster",clusterLegend(cl)[[1]][,"name"],sep="") +#' clusterLegend(cl)[[1]][,"name"]<-newNames #' #make dendrogram #' cl <- makeDendrogram(cl) #' -#' #merge clusters with plotting. Note argument 'use.edge.length' can improve -#' #readability -#' merged <- mergeClusters(cl, plot=TRUE, plotType="all", +#' #plot showing the before and after clustering +#' #(Note argument 'use.edge.length' can improve +#' #readability) +#' merged <- mergeClusters(cl, plotInfo="all", #' mergeMethod="adjP", use.edge.length=FALSE) #' +#' #Simpler plot with just dendrogram and single method +#' merged <- mergeClusters(cl, plotInfo="mergeMethod", +#' mergeMethod="adjP", use.edge.length=FALSE, +#' leafType="clusters",label="name") +#' #' #compare merged to original #' table(primaryCluster(cl), primaryCluster(merged)) +#' #' @export -#' @importFrom phylobase labels descendants ancestors getNode -#' @importClassesFrom phylobase phylo4 -#' @importFrom graphics plot -#' @importFrom ape plot.phylo #' @importFrom howmany howmany lowerbound #' @importFrom locfdr locfdr #' @rdname mergeClusters @@ -97,9 +120,10 @@ setMethod(f = "mergeClusters", signature = signature(x = "matrix"), definition = function(x, cl, dendro=NULL, mergeMethod=c("none", "adjP", "locfdr", "MB", "JC"), - plotType=c("none", "all", "mergeMethod","adjP", "locfdr", "MB", "JC"), - cutoff=0.1, doPlot=TRUE, - isCount=TRUE, ...) { + plotInfo=c("none", "all", "mergeMethod","adjP", "locfdr", "MB", "JC"), + cutoff=0.1, plot=TRUE, + isCount=TRUE, ...) { + dendroSamples<-NULL #currently option is not implemented for matrix version... if(is.factor(cl)){ warning("cl is a factor. Converting to numeric, which may not result in valid conversion") cl <- .convertToNum(cl) @@ -112,21 +136,21 @@ setMethod(f = "mergeClusters", } } mergeMethod <- match.arg(mergeMethod) - plotType <- match.arg(plotType) - if(mergeMethod=="none" & plotType=="none") stop("mergeMethod and plotType both equal 'none'; nothing to be done.") - if(plotType=="mergeMethod" & mergeMethod=="none") { - stop("can only plot merge method values if one method is selected") + plotInfo <- match.arg(plotInfo) + if(mergeMethod=="none" & plotInfo=="none") stop("mergeMethod and plotInfo both equal 'none'; nothing to be done.") + if(plotInfo=="mergeMethod" & mergeMethod=="none") { + stop("can only plot 'mergeMethod' results if one method is selected") } #get test-statistics for the contrasts corresponding to each node (and return all) sigTable <- getBestFeatures(x, cl, contrastType=c("Dendro"), dendro=dendro, contrastAdj=c("All"), number=nrow(x), p.value=1, isCount=isCount) - +#browser() #divide table into each node. whMethodCalculate<-if(!mergeMethod=="none") mergeMethod else c() - if(plotType=="all") whMethodCalculate<-c("adjP", "locfdr", "MB", "JC") - if(plotType%in% c("adjP", "locfdr", "MB", "JC")) whMethodCalculate<-unique(c(whMethodCalculate,plotType)) + if(plotInfo=="all") whMethodCalculate<-.availMergeMethods + if(plotInfo%in% .availMergeMethods) whMethodCalculate<-unique(c(whMethodCalculate,plotInfo)) sigByNode <- by(sigTable, sigTable$ContrastName, function(x) { mb <-if("MB" %in% whMethodCalculate) .myTryFunc(pvalues=x$P.Value, FUN=.m1_MB) else NA locfdr <-if("locfdr" %in% whMethodCalculate) .myTryFunc(tstats=x$t, FUN=.m1_locfdr) else NA @@ -193,105 +217,68 @@ setMethod(f = "mergeClusters", else oldClToNew=table(Original=cl, New=newcl) out<-list(clustering=newcl, oldClToNew=oldClToNew, propDE=nodePropTable, originalClusterDendro=dendro,mergeMethod=mergeMethod) - if(doPlot) .plotMerge(dendro,mergeOutput=out,plotType=plotType,mergeMethod=mergeMethod,...) + if(plot){ + clMat<-cbind(Original=cl, mergeCluster=newcl) + if(!is.null(dendroSamples)){ + if(is.null(names(cl))){ + warning("dendroSamples argument will be ignored because cl does not have names to allow for linkage to the dendroSamples values") + dendroSamples<-NULL + } + else{ + rownames(clMat)<-names(cl) + } + } + #browser() + if(!is.null(dendroSamples)) .plotDendro(dendroSamples,leafType="samples",mergeOutput=out,mergePlotType=plotInfo,mergeMethod=mergeMethod,cl=cl,label="name",outbranch=any(cl<0),...) + else .plotDendro(dendro,leafType="clusters",mergeOutput=out,mergePlotType=plotInfo,mergeMethod=mergeMethod,cl=clMat,label="name",...) + + } invisible(out) } ) -.plotMerge<-function(dendro,mergeOutput,plotType,mergeMethod,clusterLegendMat=NULL,dendroSamples=NULL,...){ - sigInfo<-mergeOutput$propDE - whToMerge<-which(sigInfo$Merged) - nodesToMerge<-sigInfo$Node[whToMerge] - methods<-colnames(sigInfo[,-c(1:3)]) - if(plotType!="none"){ - # phylobase has bug in plotting! submitted to their github - # move to ape package... - phylo4Obj <- .makePhylobaseTree(dendro, "dendro") - phyloObj <- as(phylo4Obj, "phylo") - - ##### - #convert names of internal nodes for plotting - ##### - #match to order of tree - m <- match(phyloObj$node, sigInfo$Node) - edgeLty <- rep(1, nrow(phyloObj$edge)) - if(mergeMethod != "none" && length(whToMerge) > 0) { - whMerge <- which(phyloObj$node.label %in% nodesToMerge) #which of nodes merged - nodeNumbers <- (length(phyloObj$tip) + 1):max(phyloObj$edge) - whEdge <- which(phyloObj$edge[,1] %in% nodeNumbers[whMerge]) - edgeLty[whEdge] <- 2 - } - if(plotType == "mergeMethod"){ - if(!mergeMethod %in% methods) stop("mergeMethod not in methods of output") - phyloObj$node.label <- as.character(signif(sigInfo[m,mergeMethod],2)) - } - if(plotType %in% c("all","adjP", "locfdr", "MB", "JC")) { - meth<-if(plotType=="all") methods else methods[methods%in%plotType] - phyloObj$node.label <- apply(sigInfo[,meth,drop=FALSE],1, function(x){ - whKp<-which(!is.na(x)) - paste(paste(meth[whKp], signif(x[whKp],2), sep=":"), collapse=",\n")}) - - } - #browser() - ###Add color and name from the object. - #browser() - if(!is.null(clusterLegendMat)){ - m<-match(phyloObj$tip.label,clusterLegendMat[,"clusterIds"]) - if(any(is.na(m))) stop("clusterIds do not match dendrogram labels") - phyloObj$tip.label<-clusterLegendMat[m,"name"] - tip.color<-clusterLegendMat[m,"color"] - } - else tip.color<-"black" - if(max(phyloObj$edge.length)>1e6) phyloObj$edge.length<-phyloObj$edge.length/max(phyloObj$edge.length) #otherwise get error - - ape::plot.phylo(phyloObj, show.node=TRUE, edge.lty=edgeLty, tip.color=tip.color,...) - } -} -## If want to try to add plotCluster information, from example of phydataplot in ape package: -# ## use type = "mosaic" on a 30x5 matrix: -# tr <- rtree(n <- 30) -# p <- 5 -# x <- matrix(sample(3, size = n*p, replace = TRUE), n, p) -# dimnames(x) <- list(paste0("t", 1:n), LETTERS[1:p]) -# plot(tr, x.lim = 35, align.tip = TRUE, adj = 1) -# phydataplot(x, tr, "m", 2) -# ## change the aspect: -# plot(tr, x.lim = 35, align.tip = TRUE, adj = 1) -# phydataplot(x, tr, "m", 2, width = 2, border = "white", lwd = 3, legend = "side") -# ## user-defined colour: -# f <- function(n) c("yellow", "blue", "red") -# phydataplot(x, tr, "m", 18, width = 2, border = "white", lwd = 3, -# legend = "side", funcol = f) - #' @rdname mergeClusters #' @export -#' @param clusterLabel a string used to describe the type of clustering. By +#' @param clusterLabel a string used to describe the type of clustering. By #' default it is equal to "mergeClusters", to indicate that this clustering is -#' the result of a call to mergeClusters. +#' the result of a call to mergeClusters (only if x is a ClusterExperiment object) +#' @param labelType if plotting, then whether leaves of dendrogram should be +#' labeled by rectangular blocks of color ("colorblock") or with the names of +#' the leaves ("name") (only if x is a ClusterExperiment object). +#' @param leafType if plotting, whether the leaves should be the clusters or the +#' samples. Choosing 'samples' allows for visualization of how many samples +#' are in the merged clusters (only if x is a ClusterExperiment object), which +#' is the main difference between choosing "clusters" and "samples", +#' particularly if \code{labelType="colorblock"} setMethod(f = "mergeClusters", signature = signature(x = "ClusterExperiment"), definition = function(x, eraseOld=FALSE,isCount=FALSE, - mergeMethod="none",plotType="all",clusterLabel="mergeClusters",...) { - + mergeMethod="none",plotInfo="all",clusterLabel="mergeClusters",leafType=c("samples","clusters" ),labelType=c("colorblock","name","ids"),plot=TRUE,...) { + labelType<-match.arg(labelType) + leafType<-match.arg(leafType) if(is.null(x@dendro_clusters)) { stop("`makeDendrogram` needs to be called before `mergeClusters`") } else{ - cl<-clusterMatrix(x)[,x@dendro_index] - note("Merging will be done on '",clusterLabels(x)[x@dendro_index],"', with clustering index",x@dendro_index) + cl<-clusterMatrix(x)[,dendroClusterIndex(x)] + note("Merging will be done on '",clusterLabels(x)[dendroClusterIndex(x)],"', with clustering index",dendroClusterIndex(x)) } if(isCount) note("If `isCount=TRUE` the data will be transformed with voom() rather than with the transformation function in the slot `transformation`. This makes sense only for counts.") - #browser() + if(!x@dendro_outbranch){ + if(any(cl<0) & leafType=="samples"){ + warning("You cannot set 'leafType' to 'samples' in plotting mergeClusters unless the dendrogram was made with unassigned/missing (-1,-2) set to an outgroup (see makeDendrogram)") + leafType<-"clusters" + } + } + +###Note, plot=FALSE, and then manually call .plotDendro afterwards to allow for passage of colors, etc. outlist <- mergeClusters(x=if(!isCount) transform(x) else assay(x), cl=cl, - dendro=x@dendro_clusters, plotType=plotType,doPlot=FALSE, + dendro=x@dendro_clusters, plotInfo=plotInfo,plot=FALSE, isCount=isCount,mergeMethod=mergeMethod, ...) - if(plotType!="none"){ - .plotMerge(x@dendro_clusters,mergeOutput=outlist,plotType=plotType,mergeMethod=mergeMethod,clusterLegendMat=clusterLegend(x)[[x@dendro_index]]) - } if(mergeMethod!="none"){#only add a new cluster if there was a mergeMethod. otherwise, mergeClusters just returns original cluster! #---- @@ -307,14 +294,51 @@ This makes sense only for counts.") x<-.updateCurrentWorkflow(x,eraseOld,"mergeClusters") if(!is.null(x)) retval<-.addNewResult(newObj=newObj,oldObj=x) else retval<-.addBackSEInfo(newObj=newObj,oldObj=x) - invisible(retval) } else{ #don't do anything, since there was no merging done. - invisible(x) + retval<-x } + if(plot){ + dend<- switch(leafType,"samples"=retval@dendro_samples,"clusters"=retval@dendro_clusters) + # leg<-clusterLegend(retval)[[retval@dendro_index]] + # cl<-switch(leafType,"samples"=clusterMatrix(retval)[,retval@dendro_index],"clusters"=NULL) + #browser() + if(leafType=="samples" & mergeMethod!="none" & labelType=="colorblock"){ + whClusters<-c(retval@dendro_index,primaryClusterIndex(retval)) + leg<-clusterLegend(retval)[whClusters] + cl<-clusterMatrix(retval,whichClusters=whClusters) + rownames(cl)<-if(!is.null(colnames(retval))) colnames(retval) else as.character(1:ncol(retval)) + + } + else{ + leg<-clusterLegend(retval)[[retval@dendro_index]] + cl<-switch(leafType,"samples"=clusterMatrix(retval)[,retval@dendro_index],"clusters"=NULL) + if(leafType=="samples"){ + names(cl)<-if(!is.null(colnames(retval))) colnames(retval) else as.character(1:ncol(retval)) + } + + } + #browser() + if(labelType=="id") leg[,"name"]<-leg[,"clusterIds"] + label<-switch(labelType,"name"="name","colorblock"="colorblock","ids"="name") + outbranch<-FALSE + if(leafType=="samples" & any(cl<0)) outbranch<-retval@dendro_outbranch + #if(leafType=="samples" & any(cl<0)) outbranch<-TRUE + + # outbranch<-any(clusterMatrix(retval)[,retval@dendro_index]<0) + # cl<-clusterMatrix(retval,whichCluster=retval@dendro_index) + # rownames(cl)<-colnames(retval) + # dend<-ifelse(leafType=="samples", retval@dendro_samples,retval@dendro_clusters) + .plotDendro(dendro=dend,leafType=leafType,mergeOutput=outlist,mergePlotType=plotInfo,mergeMethod=mergeMethod,cl=cl,clusterLegendMat=leg,label=label,outbranch=outbranch,removeOutbranch=outbranch,legend="none") + } + + invisible(retval) } ) + + + .myTryFunc<-function(FUN,...){ x<-try(FUN(...),silent=TRUE) if(!inherits(x, "try-error")) return(x) diff --git a/R/plotDendrogram.R b/R/plotDendrogram.R new file mode 100644 index 00000000..5ea061d2 --- /dev/null +++ b/R/plotDendrogram.R @@ -0,0 +1,406 @@ +#' @title Plot dendrogram of clusterExperiment object +#' +#' @description Plots the dendrogram saved in a clusterExperiment object +#' +#' @param x a \code{\link{ClusterExperiment}} object. +#' @param leafType if "samples" the dendrogram has one leaf per sample, +#' otherwise it has one per cluster. +#' @param main passed to the \code{plot.phylo} function to set main title. +#' @param sub passed to the \code{plot.phylo} function to set subtitle. +#' @param labelType one of 'name', 'colorblock' or 'id'. If 'Name' then +#' dendrogram will be plotted, and name of cluster or sample (depending on +#' type of value for \code{leafType}) will be plotted next to the leaf of the +#' dendrogram. If 'colorblock', rectangular blocks, corresponding to the color +#' of the cluster will be plotted, along with cluster name legend. If 'id' the +#' internal clusterIds value will be plotted (only appropriate if +#' \code{leafType="clusters"}). +#' @param ... arguments passed to the \code{\link{plot.phylo}} function of +#' \code{ape} that plots the dendrogram. +#' @param whichClusters only used if \code{leafType="samples"}). If numeric, an +#' index for the clusterings to be plotted with dendrogram. Otherwise, +#' \code{whichClusters} can be a character value identifying the +#' \code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +#' \code{clusterLabels}; alternatively \code{whichClusters} can be either +#' 'all' or 'workflow' or 'primaryCluster' to indicate choosing all clusters +#' or choosing all \code{\link{workflowClusters}}. Default 'dendro' indicates +#' using the clustering that created the dendrogram. +#' @param removeOutbranch logical, only applicable if there are missing samples +#' (i.e. equal to -1 or -2), \code{leafType="samples"} and the dendrogram +#' for the samples was made by putting missing samples in an outbranch. In +#' which case, if this parameter is TRUE, the outbranch will not be plotted, +#' and if FALSE it will be plotted. +#' @param legend logical, only applicable if \code{labelType="colorblock"}. +#' Passed to \code{\link{phydataplot}} in \code{\link{ape}} package that is +#' used to draw the color values of the clusters/samples next to the +#' dendrogram. Options are 'none', 'below', or 'side' +#' @aliases plotDendrogram +#' @details If \code{leafType="clusters"}, the plotting function will work best +#' if the clusters in the dendrogram correspond to the primary cluster. This +#' is because the function colors the cluster labels based on the colors of +#' the clusterIds of the primaryCluster +#' @importFrom ape plot.phylo +#' @export +#' +#' @examples +#' data(simData) +#' +#' #create a clustering, for 8 clusters (truth was 3) +#' cl <-clusterSingle(simData, clusterFunction="pam", subsample=FALSE, +#' sequential=FALSE, clusterDArgs=list(k=8)) +#' +#' #create dendrogram of clusters and then +#' # merge clusters based ondendrogram: +#' cl <- makeDendrogram(cl) +#' cl <- mergeClusters(cl,mergeMethod="adjP",cutoff=0.1,plot=FALSE) +#' plotDendrogram(cl) +#' plotDendrogram(cl,leafType="samples",whichClusters="all",labelType="colorblock") +#' +#' @export +#' @rdname plotDendrogram +setMethod( + f = "plotDendrogram", + signature = "ClusterExperiment", + definition = function(x,whichClusters="dendro",leafType=c("clusters","samples" ), labelType=c("name","colorblock","ids"), main,sub,removeOutbranch=TRUE,legend='side',...) + { + if(is.null(x@dendro_samples) || is.null(x@dendro_clusters)) stop("No dendrogram is found for this ClusterExperiment Object. Run makeDendrogram first.") + leafType<-match.arg(leafType) + labelType<-match.arg(labelType) + whCl<-.TypeIntoIndices(x,whClusters=whichClusters) + if(length(whCl)==0) stop("given whichClusters value does not match any clusters") + + if(missing(main)) main<-ifelse(leafType=="samples","Dendrogram of samples", "Dendrogram of clusters") + if(missing(sub)) sub<-paste("Dendrogram made with '",clusterLabels(x)[dendroClusterIndex(x)],"', cluster index ",dendroClusterIndex(x),sep="") + dend<- switch(leafType,"samples"=x@dendro_samples,"clusters"=x@dendro_clusters) + + cl<-switch(leafType,"samples"=clusterMatrix(x)[,whCl,drop=FALSE],"clusters"=NULL) + if(leafType=="samples") rownames(cl)<-if(!is.null(colnames(x))) colnames(x) else as.character(1:ncol(x)) + if(length(whCl)==1){ + leg<-clusterLegend(x)[[whCl]] + if(labelType=="id") leg[,"name"]<-leg[,"clusterIds"] + } + else{ + leg<-clusterLegend(x)[whCl] + if(labelType=="id") leg<-lapply(leg,function(x){x[,"name"]<-x[,"clusterIds"]; return(x)}) + } + label<-switch(labelType,"name"="name","colorblock"="colorblock","ids"="name") + invisible(.plotDendro(dendro=dend,leafType=leafType,mergeMethod=NULL,mergeOutput=NULL,clusterLegendMat=leg,cl=cl,label=label,outbranch=x@dendro_outbranch,main=main,sub=sub,removeOutbranch=removeOutbranch,legend=legend,...)) + + }) + + + + +######## +# Internal plotting function used by both mergeClusters and plotDendrogram +#' @importFrom phylobase labels descendants ancestors getNode edgeLength rootNode nodeLabels nNodes subset +#' @importClassesFrom phylobase phylo4 +#' @importFrom graphics plot +#' @importFrom ape plot.phylo phydataplot +.plotDendro<-function(dendro,leafType="clusters",mergePlotType=NULL,mergeMethod=NULL,mergeOutput=NULL,clusterLegendMat=NULL,cl=NULL,label=c("name","colorblock"),outbranch=FALSE,removeOutbranch=FALSE,legend="below",...){ + label<-match.arg(label) + phylo4Obj <- .makePhylobaseTree(dendro, "dendro",isSamples=(leafType=="samples"),outbranch=outbranch) + #--- + #remove the outbranch from the dendrogram and from cl + #(note this is using phylo4 obj) + #--- + if(outbranch & removeOutbranch & leafType=="samples"){ + rootNode<-phylobase::rootNode(phylo4Obj) + rootChild<-phylobase::descendants(phylo4Obj,node=rootNode,type="children") + tips<-phylobase::getNode(phylo4Obj,type="tip") + whMissingNode<-grep("MissingNode",names(rootChild)) + if(length(whMissingNode)==0){ + #check not a single -1 sample from root: + if(any(rootChild %in% tips)){ + #which ever rootChild is in tips must be single missing sample + #because can't make dendrogram with only 1 cluster so couldn't run plot or mergeClusters. + clusterNode<-rootChild[!rootChild %in% tips] + #stop("Internal coding error: need to fix .plotDendro to deal with when single missing sample") + } + else stop("Internal coding error: no outbranch nodes") + } + else clusterNode<-rootChild[-whMissingNode] + if(length(clusterNode)!=1) stop("Internal coding error: removing missing node does not leave exactly 1 descendent of root") + clusterTips<-phylobase::descendants(phylo4Obj,node=clusterNode,type="tip") + if(length(clusterTips)==0) stop("Internal coding error: no none missing samples in tree") + namesClusterTips<-names(clusterTips) + if(is.matrix(cl)) cl<-cl[namesClusterTips,] else cl<-cl[namesClusterTips] + phylo4Obj<-phylobase::subset(phylo4Obj, node.subtree=clusterNode) + #set outbranch=FALSE because now doesn't exist in tree... + outbranch<-FALSE + } + phyloObj <- as(phylo4Obj, "phylo") + + plotArgs<-list(...) + dataPct<-0.5 + offsetDivide<-16 + if(label=="colorblock" && is.null(cl) && leafType=="samples") stop("Internal coding error: must provide a clustering if label='colorblock'") + ############### + ### For plotting of dendrogram for the merging + ### Add information about the merging as node labels and change edge type + ############### + if(!is.null(mergePlotType) && mergePlotType %in% c("all","adjP", "locfdr", "MB", "JC","mergeMethod")){ + ##### + #convert names of internal nodes for plotting + ##### + #match to order of tree + #browser() + sigInfo<-mergeOutput$propDE + whToMerge<-which(sigInfo$Merged) + nodesToMerge<-as.character(sigInfo$Node[whToMerge]) + methods<-colnames(sigInfo[,-c(1:3)]) + m <- match( as.character(sigInfo$Node),phyloObj$node) + if(any(is.na(m))) stop("some nodes in mergeOutput not in the given dendrogram") + edgeLty <- rep(1, nrow(phyloObj$edge)) + if(mergeMethod != "none" && length(whToMerge) > 0){ + #which of nodes merged + whMerge <- which(phyloObj$node.label %in% nodesToMerge) + nodeNumbers <- (length(phyloObj$tip) + 1):max(phyloObj$edge) + whEdge <- which(phyloObj$edge[,1] %in% nodeNumbers[whMerge]) + edgeLty[whEdge] <- 2 + } + if(mergePlotType == "mergeMethod"){ + if(!mergeMethod %in% methods) stop("mergeMethod not in methods of output") + valsNodes<-as.character(signif(sigInfo[,mergeMethod],2)) + valsNodes[is.na(valsNodes)]<-"NA" #make them print out as NA -- otherwise doesn't plot + phyloObj$node.label[m] <- valsNodes + # offsetDivide<-3 + # dataPct<-.7 + } + if(mergePlotType %in% c("all",.availMergeMethods)) { + meth<-if(mergePlotType=="all") methods else methods[methods%in%mergePlotType] + phyloObj$node.label[m] <- apply(sigInfo[,meth,drop=FALSE],1, + function(x){ + whKp<-which(!is.na(x)) + vals<-paste(paste(meth[whKp], signif(x[whKp],2), sep=":"), collapse="\n") + vals[is.na(vals)]<-"NA" + return(vals) + }) + if(mergePlotType!="all"){ + # offsetDivide<-3 + # dataPct<-.7 + } + else{ + # offsetDivide<-2.5 + # dataPct<-.7 + + } + } + + phyloObj$node.label[-m]<-"" + plotArgs$show.node.label<-TRUE + plotArgs$edge.lty<-edgeLty + } + ############### + ### Deal with clusterLegend object: + ### - Make default if not provided and + ### - If # of clusterings>1 make clusterLegend and cl matrix appropriate + ############### + if(label=="colorblock"){ + clusterLegend<-TRUE #doesn't do anything right now because phydataplot doesn't have option of no legend... + if(is.null(clusterLegendMat)){ + #---- + #make default colors, works for vector or matrix cl + #---- + clusterIds<-sort(unique(as.vector(cl))) + clusterLegendMat<-cbind("clusterIds"=clusterIds,"name"=clusterIds,"color"=bigPalette[1:length(clusterIds)]) + } + else{ + if(is.matrix(cl) && ncol(cl)>1){ + #if not provide list of cluster legends, do only 1st clustering provided (temporary while fixing so works for matrix) + if(!is.list(clusterLegendMat) ) cl<-cl[,1,drop=FALSE] + else{ + #---- + #create one big cl/clusterLegendMat object that will allow for coloring that is okay. + #---- + nclusters<-ncol(cl) + if(length(clusterLegendMat)!=nclusters) stop("Internal coding error -- wrong length of colors for clustering") + newClusterLegendMat<-clusterLegendMat[[1]] + newCl<-cl[,1] + #make it general in case some day want more than just 2 clusterings + for(ii in 2:nclusters){ + #browser() + currMat<-clusterLegendMat[[ii]] + currCl<-cl[,ii] + whExistingColor<-which(currMat[,"color"] %in% newClusterLegendMat[,"color"]) + + if(length(whExistingColor)>0){ + #find new id to give it + matchNew<-match(currMat[whExistingColor,"color"],newClusterLegendMat[,"color"]) + oldId<-currMat[whExistingColor,"clusterIds"] + newId<-newClusterLegendMat[matchNew,"clusterIds"] + mexist<-match(currCl,oldId) + newFullId<-as.numeric(newId[mexist]) + currCl[!is.na(mexist)]<-newFullId[!is.na(mexist)] + + #change name so combination, if not already the same + whDiff<-which(newClusterLegendMat[matchNew,"name"]!=currMat[whExistingColor,"name"]) + if(length(whDiff)>0){ + combName<-paste(newClusterLegendMat[matchNew,"name"],currMat[whExistingColor,"name"],sep="/") + newClusterLegendMat[matchNew[whDiff],"name"]<-combName[whDiff] + + } + + #remove from current color scheme + currMat<-currMat[-whExistingColor,,drop=FALSE] + } + #browser() + if(nrow(currMat)>0){ + ## increase remaing ids + maxNew<-max(as.vector(newCl)) + oldId2<-currMat[,"clusterIds"] + newId2<-seq(from=maxNew+1,by=1,length=length(oldId2)) #replace with this in legend + mexist2<-match(currCl,oldId2) #match old ids to the clusterings vector + newFullId2<-as.numeric(newId2[mexist2]) #will get NAs for those that don't match (e.g. have been changed by previous step) + currCl[!is.na(mexist2)]<-newFullId2[!is.na(mexist2)] + + ## change ids in currMat + currMat[,"clusterIds"]<-newId2 + #browser() + + ## test correct that no overlap in ids or names or colors: + if(any(currMat[,"clusterIds"] %in% newClusterLegendMat[,"clusterIds"])) stop("Internal coding error: still overlap in cluster Ids") + if(any(currMat[,"color"] %in% newClusterLegendMat[,"color"])) stop("Internal coding error: still overlap in color") + + ## add to new cluster color legend + newClusterLegendMat<-rbind(newClusterLegendMat,currMat) + } + newCl<-cbind(newCl,currCl) + + } + #browser() + clusterLegendMat<-newClusterLegendMat + colnames(newCl)<-colnames(cl) + rownames(newCl)<-rownames(cl) + cl<-newCl + clusterLegend<-FALSE + + } + + } + } + } + ############### + ### Deal with clusterLegend object: + ### - Add color of cluster and cluster/sample name to tip labels if labelType=="name" + ### - Make colorMat matrix if labelType=="colorblock" + ############### + edge.width=1 + if(!is.null(clusterLegendMat)){ + if(leafType=="clusters"){ + #get rid of matching string + m<-match(gsub("ClusterId","",phyloObj$tip.label),clusterLegendMat[,"clusterIds"]) + #browser() + if(any(is.na(m))) stop("clusterIds do not match dendrogram labels") + phyloObj$tip.label<-clusterLegendMat[m,"name"] + tip.color<-clusterLegendMat[m,"color"] + if(label=="colorblock"){ + #browser() + clusterLegendMat<-clusterLegendMat[!clusterLegendMat[,"clusterIds"]%in%c(-1,-2),] + colorMat<-matrix(clusterLegendMat[,"name"],ncol=1) + row.names(colorMat)<-clusterLegendMat[,"name"] + cols<-clusterLegendMat[,"color"] + names(cols)<-clusterLegendMat[,"name"] + + + } + + } + if(leafType=="samples"){ + if(is.matrix(cl) && ncol(cl)>1){ + clNames<-row.names(cl) + if(label=="colorblock"){ + colorMat<-apply(cl,2,function(x){ + m<-match(x,clusterLegendMat[,"clusterIds"]) + clusterLegendMat[m,"name"] + }) + if(any(dim(colorMat)!=dim(cl))) stop("Internal coding error: dimensions of colorMat don't match input") + dimnames(colorMat)<-dimnames(cl) + #m<-match(cl[,1],clusterLegendMat[,"clusterIds"]) + cols<-clusterLegendMat[,"color"] + names(cols)<-clusterLegendMat[,"name"] + + } + tip.color<-"black" + #browser() + } + else{ + if(is.matrix(cl)) cl<-cl[,1] + clNames<-names(cl) + m<-match(cl,clusterLegendMat[,"clusterIds"]) + tip.color<-clusterLegendMat[m,"color"] + if(label=="colorblock"){ + colorMat<-matrix(clusterLegendMat[m,"name"],ncol=1) + rownames(colorMat)<-names(cl) + cols<-tip.color + names(cols)<-clusterLegendMat[m,"name"] + } + } + if(label=="colorblock"){ + ntips<-length(phyloObj$tip.label) + whClusterNode<-which(!is.na(phyloObj$node.label) & phyloObj$node.label!="")+ ntips + #only edges going to/from these nodes + whEdgePlot<-which(apply(phyloObj$edge,1,function(x){any(x %in% whClusterNode)})) + edge.width<-rep(0,nrow(phyloObj$edge)) + edge.width[whEdgePlot]<-1 + } + m<-match(phyloObj$tip.label,clNames) + if(any(is.na(m))) stop("names of cl do not match dendrogram labels") + + } + } + else tip.color<-"black" + + + #browser() + ############### + #this next code is hack to deal with error sometimes get if very long edge length -- usually due to unusual distance, etc. + # Divides edge lengths so not too large. + ############### + if(max(phyloObj$edge.length)>1e6) phyloObj$edge.length <- phyloObj$edge.length / max(phyloObj$edge.length) + + prohibitOptions<-c("tip.color","node.pos","edge.width") + if(any(prohibitOptions %in% names(plotArgs))) stop("User cannot set following options to plot.phylo:",paste(prohibitOptions, collapse=",")) + plotArgs<-c(plotArgs,list(tip.color=tip.color,node.pos=2,edge.width=edge.width)) + # browser() + if(label=="name") do.call(ape::plot.phylo,c(list(phyloObj),plotArgs)) + else{#if colorblock + phyloPlotOut<-do.call(ape::plot.phylo,c(list(phyloObj,show.tip.label = FALSE,plot=FALSE),plotArgs)) + treeWidth<-phyloPlotOut$x.lim[2] + do.call(ape::plot.phylo,c(list(phyloObj,show.tip.label = FALSE,x.lim=treeWidth*(1+dataPct)),plotArgs)) + + nclusters<-ncol(colorMat) + colnames(colorMat)<-NULL + if(nclusters==1 & packageVersion("ape")<'4.1.0.6'){ + #this is a temporary hack, because right now function has bug and fails for a 1-column matrix or vector. Have reported this 5/23/2017 and now fixed in new version of ape. + colorMat<-cbind(colorMat,colorMat) + } + + #we have to do this to get order for colors to be what we want! + #basically have to redo code in phydataplot so figure out what order is in plot of the leaves, etc. Poor function. New version of ape fixes this. + getColFun<-function(x,phy,namedColors){ + x <- ape:::.matchDataPhylo(x, phy) + n <- length(phy$tip.label) + one2n <- seq_len(n) + lastPP <- get("last_plot.phylo", envir = ape:::.PlotPhyloEnv) + y1 <- lastPP$yy[one2n] + o <- order(y1) + if(!is.null(ncol(x))) ux<-unique.default(x[o,]) + else ux<-unique.default(x[o]) + m<-match(as.character(ux),names(namedColors)) + namedColors[m] + } + if(packageVersion("ape")<'4.1.0.6') cols<-getColFun(colorMat,phyloObj,cols) + colInput<-function(n){cols} + ape::phydataplot(x=colorMat, phy=phyloObj, style="mosaic",offset=treeWidth*dataPct/offsetDivide, width = treeWidth*dataPct/4, border = NA, lwd = 3,legend = legend, funcol = colInput) + if(nclusters>1 & !is.null(colnames(cl))){ + xloc<-treeWidth+treeWidth*dataPct/offsetDivide+seq(from=0,by=treeWidth*dataPct/4,length=ncol(cl)) + ypos<-par("usr")[4]+0*diff(par("usr")[3:4]) + text(x=xloc,y=ypos,labels=colnames(cl),srt=45,xpd=NA,adj=c(0,0)) + + } + #browser() + + } + + invisible(phyloObj) + } diff --git a/R/plotHeatmap.R b/R/plotHeatmap.R index db6ee3b6..d9dd62bd 100644 --- a/R/plotHeatmap.R +++ b/R/plotHeatmap.R @@ -6,27 +6,28 @@ #' of \code{NMF} package. #' #' @docType methods -#' @param sampleData If input is either a \code{\link{ClusterExperiment}} or +#' @param sampleData If input to \code{data} is either a \code{\link{ClusterExperiment}} or #' \code{SummarizedExperiment} object, then \code{sampleData} must index the #' sampleData stored as a \code{DataFrame} in \code{colData} slot of the #' object. Whether that data is continuous or not will be determined by the -#' properties of \code{colData} (no user input is needed). If input is matrix, +#' properties of \code{colData} (no user input is needed). If input to \code{data} is matrix, #' \code{sampleData} is a matrix of additional data on the samples to show -#' above heatmap. Unless indicated by \code{whSampleDataCont}, +#' above heatmap. In this case, unless indicated by \code{whSampleDataCont}, #' \code{sampleData} will be converted into factors, even if numeric. ``-1'' #' indicates the sample was not assigned to a cluster and gets color #' `unassignedColor' and ``-2`` gets the color 'missingColor'. #' @param data data to use to determine the heatmap. Can be a matrix, #' \code{\link{ClusterExperiment}} or #' \code{\link[SummarizedExperiment]{SummarizedExperiment}} object. The -#' interpretation of parameters depends on the type of the input. +#' interpretation of parameters depends on the type of the input to \code{data}. #' @param whSampleDataCont Which of the \code{sampleData} columns are continuous #' and should not be converted to counts. \code{NULL} indicates no additional -#' \code{sampleData}. -#' @param visualizeData either a character string, indicating what form of the -#' data should be used for visualizing the data (i.e. for making the -#' color-scale), or a data.frame/matrix with same dimensions of -#' \code{assay(data)}. +#' \code{sampleData}. Only used if \code{data} input is matrix. +#' @param visualizeData either a character string, indicating what form of the +#' data should be used for visualizing the data (i.e. for making the +#' color-scale), or a data.frame/matrix with same number of samples as +#' \code{assay(data)}. If a new data.frame/matrix, any character arguments to +#' clusterFeaturesData will be ignored. #' @param clusterSamplesData If \code{data} is a matrix, either a matrix that #' will be used to in \code{hclust} to define the hiearchical clustering of #' samples (e.g. normalized data) or a pre-existing dendrogram that clusters @@ -148,12 +149,18 @@ #' upper quantile of \code{data}, and then all values after the #' 0.9 quantile will be absorbed by the upper-most color bin. This can help to #' reduce the visual impact of a few highly expressed genes (features). -#' @details Note that plotHeatmap calls \code{\link[NMF]{aheatmap}} under the -#' hood. This allows you to plot multiple heatmaps via -#' \code{par(mfrow=c(2,2))}, etc. However, the dendrograms do not resize if -#' you change the size of your plot window in an interactive session of R +#' @details Note that plotHeatmap calls \code{\link[NMF]{aheatmap}} under the +#' hood. This allows you to plot multiple heatmaps via +#' \code{par(mfrow=c(2,2))}, etc. However, the dendrograms do not resize if +#' you change the size of your plot window in an interactive session of R #' (this might be a problem for RStudio if you want to pop it out into a large -#' window...). +#' window...). Also, plotting to a pdf adds a blank page; see help pages of +#' \code{\link[NMF]{aheatmap}} for how to turn this off. +#' @details If you have a factor with many levels, it is important to note that +#' \code{\link[NMF]{aheatmap}} does not recycle colors across factors in the +#' \code{sampleData}, and in fact runs out of colors and the remaining levels +#' get the color white. Thus if you have many factors or many levels in those +#' factors, you should set their colors via \code{clusterLegend}. #' @details Many arguments can be passed on to aheatmap, however, some are set #' internally by \code{plotHeatmap.} In particular, setting the values of #' \code{Rowv} or \code{Colv} will cause errors. \code{color} in @@ -261,56 +268,20 @@ setMethod( .convertTry<-function(x,tryResult){if(!inherits(tryResult,"try-error")) return(tryResult) else return(x)} - #### - ##Transform data and determine which features to use - #### - clusterFeaturesData <- .convertTry(clusterFeaturesData, - try(match.arg(clusterFeaturesData), - silent=TRUE)) - - if(is.list(clusterFeaturesData)){ - groupFeatures<-clusterFeaturesData - clusterFeaturesData<-unlist(clusterFeaturesData) - } - else groupFeatures<-NULL - if(all(clusterFeaturesData %in% c("var","all","PCA"))){ # - dimReduce=switch(clusterFeaturesData, - "var"="var", - "PCA"="PCA", - "all"="none") - if(is.null(nFeatures)) nFeatures<-min(switch(clusterFeaturesData,"var"=500,"all"=nFeatures(data),"PCA"=50),nFeatures(data)) - wh<-1:NROW(data) - } - else{ - if(is.character(clusterFeaturesData)){#gene names - if(is.null(rownames(data))) stop("Cannot give feature names in clusterFeaturesData unless assay(data) has rownames") - else{ - wh<-match(clusterFeaturesData,rownames(data)) - if(all(is.na(wh))) stop("None of the feature names in clusterFeaturesData match rownames(assay(data))") - if(any(is.na(wh))){ - warning("Not all of the feature names in clusterFeaturesData match rownames(assay(data))") - wh<-na.omit(wh) - } - } - } - else{ - if(any(!clusterFeaturesData %in% 1:NROW(data))) stop("invalid indices for clusterFeaturesData") - wh<-clusterFeaturesData - } - dimReduce<-"none" - } - transObj<-.transData(transFun = transformation(data), x=assay(data[wh,]), nPCADims=nFeatures,nVarDims = nFeatures,dimReduce = dimReduce) - if(dimReduce%in%"PCA") wh<-1:nFeatures - if(dimReduce=="var") wh<-transObj$whMostVar #give indices that will pull ######### - ##Assign visualization data and clusterFeaturesData + ##Determine visualization data and default colorScale based on that ######### - #browser() + externalData<-FALSE visualizeData <- .convertTry(visualizeData, try(match.arg(visualizeData), silent=TRUE)) if(is.character(visualizeData)){ if(!visualizeData %in% c("transformed","centeredAndScaled","original")) stop("visualizeData value, '",visualizeData,"',is invalid option") } + else{ + if(!is.data.frame(visualizeData) && !is.matrix(visualizeData)) stop("if visualizeData is not character, must be either data frame or matrix") + externalData<-TRUE + if(!ncol(visualizeData)==ncol(assay(data))) stop("if give separate visualizeData, must have same number of sample (columns) as assay(data)") + } if(missing(colorScale)) { colorScale <- seqPal5 if(is.character(visualizeData)) { @@ -320,61 +291,80 @@ setMethod( } } - if(all(clusterFeaturesData=="PCA")) heatData<-transObj$x - else{ - if(!is.data.frame(visualizeData) && !is.matrix(visualizeData)){ - heatData<-switch(visualizeData, - "original"=assay(data[wh,]), - "transformed"=transObj$x, - "centeredAndScaled"=t(scale(t(transObj$x),center=TRUE,scale=TRUE)) - ) - } - else{ - if(!all(dim(visualizeData)==dim(assay(data)))) stop("if give separate visualizeData, must be of same dimensions as assay(data)") - heatData<-data.matrix(visualizeData)[wh,] #still use the variables identified above. - } + #### + ##Transform data and determine which features to use + #### + clusterFeaturesData <- .convertTry(clusterFeaturesData, + try(match.arg(clusterFeaturesData), + silent=TRUE)) + + if(is.list(clusterFeaturesData)){ + groupFeatures<-clusterFeaturesData + clusterFeaturesData<-unlist(clusterFeaturesData) } + else groupFeatures<-NULL + if(!externalData){ + if(all(clusterFeaturesData %in% c("var","all","PCA"))){ # + dimReduce=switch(clusterFeaturesData, + "var"="var", + "PCA"="PCA", + "all"="none") + if(is.null(nFeatures)) nFeatures<-min(switch(clusterFeaturesData,"var"=500,"all"=nFeatures(data),"PCA"=50),nFeatures(data)) + wh<-1:NROW(data) + } + else{ + if(is.character(clusterFeaturesData)){#gene names + if(is.null(rownames(data))) stop("Cannot give feature names in clusterFeaturesData unless assay(data) has rownames") + else{ + wh<-match(clusterFeaturesData,rownames(data)) + if(all(is.na(wh))) stop("None of the feature names in clusterFeaturesData match rownames(assay(data))") + if(any(is.na(wh))){ + warning("Not all of the feature names in clusterFeaturesData match rownames(assay(data))") + wh<-na.omit(wh) + } + } + } + else{ + if(any(!clusterFeaturesData %in% 1:NROW(data))) stop("invalid indices for clusterFeaturesData") + wh<-clusterFeaturesData + } + dimReduce<-"none" + } + transObj<-.transData(transFun = transformation(data), x=assay(data[wh,]), nPCADims=nFeatures,nVarDims = nFeatures,dimReduce = dimReduce) + if(dimReduce%in%"PCA") wh<-1:nFeatures + if(dimReduce=="var") wh<-transObj$whMostVar #give indices that will pull + if(all(clusterFeaturesData=="PCA")) heatData<-transObj$x + else{ + #note, transObj is already been limited to the wh. + heatData<-switch(visualizeData, + "original"=assay(data[wh,]), + "transformed"=transObj$x, + "centeredAndScaled"=t(scale(t(transObj$x),center=TRUE,scale=TRUE)) + ) + } + } + else{ + heatData<-visualizeData + } + ###### #Make sampleData based on clusterings and columns of colData ###### + #--- #Get clusterings - if(is.character(whichClusters)) whCl<-.TypeIntoIndices(data,whClusters=whichClusters) - else whCl<-whichClusters + #--- + whCl<-.TypeIntoIndices(data,whClusters=whichClusters) if(length(whCl)>0){ - if(!is.numeric(whCl)) stop("invalid whichClusters choices") - if(!all(whCl %in% 1:nClusters(data))) stop("Indices in whichClusters invalid: not in 1 to nClusters(data)") clusterData<-clusterMatrixNamed(data)[,whCl,drop=FALSE] } else{ - if(whichClusters!="none") warning("given whichClusters value does not match any clusters") + if(any( whichClusters!="none")) warning("given whichClusters value does not match any clusters, none will be plotted") clusterData<-NULL } - clLegend<-clusterLegend(data)[whCl] #note, this gives names even though not stored internally so will match, which plotHeatmap needs - if(length(clLegend)==0) clLegend<-NULL - #browser() - #check user didn't give something different for colors - userList<-list(...) - userAlign<-"alignSampleData" %in% names(userList) - userLegend<-"clusterLegend" %in% names(userList) - if(userAlign | userLegend){ #if user asks for alignment, don't assign clusterLegend - if(userLegend){ - userClLegend<-userList[["clusterLegend"]] - #keep existing clLegend from clusterExperiment object if not conflict with user input: - whNotShared<-which(!names(clLegend)%in% names(userClLegend)) - if(length(whNotShared)>0) clLegend<-c(userClLegend,clLegend[whNotShared]) else clLegend<-userClLegend - clLegend<-.convertToAheatmap(clLegend, names=TRUE) - userList<-userList[-grep("clusterLegend",names(userList))] - } - else{ - if(userAlign){ - al<-userList[["alignSampleData"]] - if(al) clLegend<-NULL - } - } - } - - #get colData values + #--- + #get colData values and subset to those asked for by user + #--- sData<-.pullSampleData(data,sampleData) #identify which numeric if(!is.null(sData)) whCont<-which(sapply(1:ncol(sData),function(ii){is.numeric(sData[,ii])})) @@ -393,16 +383,44 @@ setMethod( if(is.null(sData) & is.null(clusterData)) sampleData<-NULL } - + #------ + #check user didn't give something different for colors + #------ + clLegend<-clusterLegend(data)[whCl] #note, clusterLegend gives names even though not stored internally with @clusterLegend so will match, which plotHeatmap needs + if(length(clLegend)==0) clLegend<-NULL + + + userList<-list(...) + userAlign<-"alignSampleData" %in% names(userList) + userLegend<-"clusterLegend" %in% names(userList) + if(userAlign | userLegend){ #if user asks for alignment, don't assign clusterLegend + if(userLegend){ + userClLegend<-userList[["clusterLegend"]] + userClLegend<-userClLegend[names(userClLegend) %in% colnames(sampleData)] + if(length(userClLegend)==0) warning("names of list given by user in clusterLegend do not match clusters nor sampleData chosen. Will be ignored.") + else{ + #keep existing clLegend from clusterExperiment object if not conflict with user input: + whNotShared<-which(!names(clLegend)%in% names(userClLegend)) + if(length(whNotShared)>0) clLegend<-c(userClLegend,clLegend[whNotShared]) else clLegend<-userClLegend + clLegend<-.convertToAheatmap(clLegend, names=TRUE) + userList<-userList[-grep("clusterLegend",names(userList))] + } + } + else{ + if(userAlign){ + al<-userList[["alignSampleData"]] + if(al) clLegend<-NULL + } + } + } + ###### #Create clusterSamplesData ###### clusterSamplesData<-.convertTry(clusterSamplesData,try(match.arg(clusterSamplesData),silent=TRUE)) - #browser() if(is.logical(clusterSamplesData)) clusterSamples<-clusterSamplesData else { clusterSamples<-TRUE - #browser() if(is.numeric(clusterSamplesData)){ heatData<-heatData[,clusterSamplesData,drop=FALSE] if(!is.null(sampleData)) sampleData<-sampleData[clusterSamplesData,,drop=FALSE] @@ -450,12 +468,12 @@ setMethod( blankData<-makeBlankData(heatData,groupFeatures) heatData<-data.matrix(blankData$dataWBlanks) labRow<-blankData$rowNamesWBlanks - #browser() clusterFeatures<-FALSE } else{ labRow<-rownames(heatData) } + do.call("plotHeatmap",c(list(data=heatData, clusterSamplesData=clusterSamplesData, clusterFeaturesData=heatData, #set it so user doesn't try to pass it and have something weird happen because dimensions wrong, etc. @@ -515,13 +533,15 @@ setMethod( } return(val) } - #browser() badValues<-c("Rowv","Colv","color","annCol","annColors") - if(any(badValues %in% names(aHeatmapArgs))) stop("The following arguments to aheatmap cannot be set by the user in plotHeatmap:",paste(badValues,collapse=",")) + replacedValues<-c("clusterSamplesData","clusterFeaturesData","colorScale","sampleData","clusterLegend") + if(any(badValues %in% names(aHeatmapArgs))) stop("The following arguments to aheatmap cannot be set by the user in plotHeatmap:",paste(badValues,collapse=","),". They are over-ridden by: ",paste(replacedValues,collapse=",")) - ###Create the clustering dendrogram: + ########## + ###Create the clustering dendrogram (samples): + ########## if(clusterSamples){ if(inherits(clusterSamplesData, "dendrogram")){ @@ -539,7 +559,6 @@ setMethod( clusterSamplesData<-data.matrix(clusterSamplesData) #check valid if(ncol(clusterSamplesData)!=ncol(heatData)) stop("clusterSamplesData matrix does not have on same number of observations as heatData") -# browser() dendroSamples<-NMF:::cluster_mat(t(clusterSamplesData),param=TRUE,distfun=getHeatmapValue("distfun"),hclustfun=getHeatmapValue("hclustfun"),reorderfun=getHeatmapValue("reorderfun",value=function(d, w) reorder(d, w)))$dendrogram #dendroSamples<-as.dendrogram(stats::hclust(stats::dist(t(clusterSamplesData)))) #dist finds distances between rows @@ -553,6 +572,9 @@ setMethod( if(!is.na(clusterSamples) && clusterSamples && is.null(dendroSamples)) Colv<-TRUE #then just pass the data else Colv<-if(!is.na(clusterSamples) && clusterSamples) dendroSamples else clusterSamples + ########## + ###Create the clustering dendrogram (features): + ########## if(isSymmetric){ Rowv<-Colv Colv<-"Rowv" @@ -589,8 +611,6 @@ setMethod( } - #browser() - ########## @@ -605,41 +625,52 @@ setMethod( if(overRideClusterLimit) warning("More than 10 annotations/clusterings can result in incomprehensible errors in aheamap. You have >10 but have chosen to override the internal stop by setting overRideClusterLimit=TRUE.") else stop("More than 10 annotations/clusterings cannot be reliably shown in plotHeatmap. To override this limitation and try for yourself, set overRideClusterLimit=TRUE.") } - ###Make sampleData explicitly factors, except for whSampleDataCont - ###(not sure why this simpler code doesn't give back data.frame with factors: annCol<-apply(annCol,2,function(x){factor(x)})) - #browser() - #check that no ordered factors... + #------------------- + ###Make sampleData explicitly factors, except for whSampleDataCont + #------------------- + + #--- check that no ordered factors... anyOrdered<-sapply(1:ncol(sampleData),function(ii){is.ordered(sampleData[,ii])}) if(any(anyOrdered)) stop("The function aheatmap in the NMF package that is called to create the heatmap does not currently accept ordered factors (https://github.com/renozao/NMF/issues/83)") - - tmpDf<-do.call("data.frame",lapply(1:ncol(sampleData),function(ii){factor(sampleData[,ii])})) + ###(not sure why this simpler code doesn't give back data.frame with factors: annCol<-apply(annCol,2,function(x){factor(x)})) + tmpDf<-do.call("data.frame", lapply(1:ncol(sampleData), function(ii){ factor(sampleData[,ii]) })) names(tmpDf)<-colnames(sampleData) if(!is.null(whSampleDataCont)){ - if(logical(whSampleDataCont)) whSampleDataCont<-which(whSampleDataCont) - if(length(whSampleDataCont)>0) tmpDf[,whSampleDataCont]<-sampleData[,whSampleDataCont] + if(any(logical(whSampleDataCont))) whSampleDataCont<-which(whSampleDataCont) + if(length(whSampleDataCont)>0) tmpDf[,whSampleDataCont]<-sampleData[,whSampleDataCont] } annCol<-tmpDf - #browser() convertNames <- TRUE - if(is.null(clusterLegend)){ #assign default colors + + ########## + ##Deal with colors ... + ########## + #----- + #assign default colors + #----- + if(is.null(clusterLegend)){ convertNames <- TRUE - if(is.null(whSampleDataCont) || length(whSampleDataCont)0){ whInAnnColors<-setdiff(whInAnnColors,whSampleDataCont) } prunedList<-lapply(whInAnnColors,function(ii){ - nam<-names(annColors)[[ii]] - x<-annColors[[ii]] + nam<-names(annColors)[[ii]] #the name of variable + x<-annColors[[ii]] levs<-levels(annCol[,nam]) - x<-x[levs] + if(length(x)1) red<-red[-match(name,red)] - } - else{# otherwise user meant to do none *as well* as dimReduce with other values. - red<-unique(c("none",red)) #add 'none' and remove NA - ndims<-ndims[!is.na(ndims)] - } - } - dimReduce<<-red - if( name %in% varValues) nVarDims<<- ndims - if(name =="PCA") nPCADims<<-ndims + + origX <- x + #transform data + if(is.null(transFun)){ + transFun <- if(isCount) function(x){log2(x+1)} else function(x){x} + } + + x <- try(transFun(x), silent=TRUE) + + if(inherits(x, "try-error")) + stop("User-supplied `transFun` produces error on the input data matrix:\n",x) + if(any(is.na(x))) + stop("User-supplied `transFun` produces NA values") + + ################### + ###Dim Reduction + ################### + ##Check user inputs + ################### + #check valid options for dimReduce + varValues <- c("var","mad","cv") + if(any(!dimReduce %in% c("none","PCA",varValues))) + stop("invalid options for 'dimReduce' must be one of: 'none','PCA',",paste(varValues,collapse=",")) + + if(any(dimReduce!="none")){ + + ##Function to check and interpret values given + checkValues <- function(name){ + ndims <- switch(as.character(name %in% varValues), + "TRUE"=nVarDims, "FALSE"=nPCADims) + red <- dimReduce + if(any(is.na(ndims)) & name %in% red){ #if NA in ndims + if(length(ndims)==1){ #ndims is only a NA value -- assume user goofed and meant to do just "none" + if(length(red)==1) red<-"none" + if(length(red)>1) red <- red[-match(name, red)] } - #browser() - - lapply(c("PCA",varValues),checkValues) - - dimReduce<-unique(dimReduce) - nVarDims<-unique(nVarDims) - nPCADims<-unique(nPCADims) - - - #browser() - xPCA<-xVAR<-xNone<-NULL #possible values - #logical as to whether return single matrix or list of matrices - listReturn<- !(length(dimReduce)==1 &&( dimReduce=="none" || (dimReduce=="PCA" & length(nPCADims)==1) || (dimReduce %in% varValues & length(nVarDims)==1))) - whFeatures<-NULL - - xCL<-x - if(!is.null(clustering)){ - if(any(!is.numeric(clustering))) stop("clustering vector must be numeric") - if(length(clustering)!=ncol(x)) stop("clustering must be vector of length equal to columns of x") - if(all(clustering<0)) stop("All entries of clustering are negative") - if(sum(clustering<0)==ncol(x)-1) stop("only one value in clustering not negative, cannot do dim reduction") - if(any(clustering<0)) xCL<-x[,-which(clustering<0)] + else{# otherwise user meant to do none *as well* as dimReduce with other values. + red <- unique(c("none", red)) #add 'none' and remove NA + ndims <- ndims[!is.na(ndims)] } - #browser() - ################## - #PCA dim reduction - ################## - if("PCA" %in% dimReduce){ - ######Check dimensions - if(max(nPCADims)>NROW(x)) stop("the number of PCA dimensions must be strictly less than the number of rows of input data matrix") - if(min(nPCADims)<=0) stop("the number of PCA dimensions must be a value greater than 0") - pctReturn<-any(nPCADims<1) - if(max(nPCADims)>100) warning("the number PCA dimensions to be selected is greater than 100. Are you sure you meant to choose to use PCA dimensionality reduction rather than the top most variable features?") - - ######Check zero variance genes: - rowvars <- matrixStats::rowVars(x) - if(any(rowvars==0)) { - if(all(rowvars==0)) { - stop("All features have zero variance.") - } - warning("Found features with zero variance.\nMost likely these are features with 0 across all samples.\nThey will be removed from PCA dimensionality reduction step.") - } - prcObj<-stats::prcomp(t(x[which(rowvars>0),]),center=TRUE,scale=TRUE) - prvar<-prcObj$sdev^2 #variance of each component - prvar<-prvar/sum(prvar) - prc<-t(prcObj$x) - if(NCOL(prc)!=NCOL(origX)) stop("error in coding of principle components.") - if(any(nPCADims<1)) pctReturn<-TRUE - if(!listReturn){ #nPCADims length 1; just return single matrix - if(pctReturn) nPCADims<-which(cumsum(prvar)>nPCADims)[1] #pick first pca coordinate with variance > value - xRet<-prc[1:nPCADims,] - } - else{ - if(pctReturn){ - whPct<-which(nPCADims<1) - pctNDims<-sapply(nPCADims[whPct],function(pct){ - val<-which(cumsum(prvar)>pct)[1] #pick first pca coordinate with variance > value - if(length(val)==0) val<-length(prvar) #in case some numerical problem - return(val) - }) - if(any(is.na(pctNDims))) browser() - nPCADims[whPct]<-pctNDims - } - xPCA<-lapply(nPCADims,function(nn){prc[1:nn,]}) - names(xPCA)<-paste("nPCAFeatures=",nPCADims,sep="") - } - + } + dimReduce <<- red + if( name %in% varValues) nVarDims<<- ndims + if(name =="PCA") nPCADims <<- ndims + } + #browser() + + lapply(c("PCA", varValues), checkValues) + + dimReduce <- unique(dimReduce) + nVarDims <- unique(nVarDims) + nPCADims <- unique(nPCADims) + + xPCA <- xVAR <- xNone <-NULL #possible values + #logical as to whether return single matrix or list of matrices + listReturn<- !(length(dimReduce)==1 && + (dimReduce=="none" || + (dimReduce=="PCA" & length(nPCADims)==1) || + (dimReduce %in% varValues & length(nVarDims)==1))) + whFeatures <- NULL + + xCL <- x + if(!is.null(clustering)){ + if(any(!is.numeric(clustering))) + stop("clustering vector must be numeric") + if(length(clustering)!=ncol(x)) + stop("clustering must be vector of length equal to columns of x") + if(all(clustering<0)) + stop("All entries of clustering are negative") + if(sum(clustering<0)==ncol(x)-1) + stop("only one value in clustering not negative, cannot do dim reduction") + if(any(clustering<0)) + xCL<-x[, -which(clustering<0)] + } + + + ################## + #PCA dim reduction + ################## + + if("PCA" %in% dimReduce){ + + ######Check dimensions + if(max(nPCADims)>NROW(x)) + stop("the number of PCA dimensions must be strictly less than the number of rows of input data matrix") + if(min(nPCADims)<=0) + stop("the number of PCA dimensions must be a value greater than 0") + + pctReturn <- any(nPCADims < 1) + if(max(nPCADims)>100) + warning("the number PCA dimensions to be selected is greater than 100. Are you sure you meant to choose to use PCA dimensionality reduction rather than the top most variable features?") + + ######Check zero variance genes: + rowvars <- matrixStats::rowVars(x) + if(any(rowvars==0)) { + if(all(rowvars==0)) { + stop("All features have zero variance.") } - - ################## - #Feature variability dim reduction - ################## - #for each dim reduction method requested - capwords <- function(s, strict = FALSE) { #From help of tolower - cap <- function(s) paste(toupper(substring(s, 1, 1)), - {s <- substring(s, 2); if(strict) tolower(s) else s}, - sep = "", collapse = " " ) - sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s))) + warning("Found features with zero variance.\nMost likely these are features with 0 across all samples.\nThey will be removed from PCA dimensionality reduction step.") + } + if(pctReturn) { + prcObj<-stats::prcomp(t(x[which(rowvars>0),]),center=TRUE,scale=TRUE) + prvar<-prcObj$sdev^2 #variance of each component + prvar<-prvar/sum(prvar) + prc<-t(prcObj$x) + } else { + prc <- t(.pca(t(x[which(rowvars>0),]), center=TRUE, scale=TRUE, + k=max(nPCADims))) + } + + if(pctReturn & NCOL(prc) != NCOL(origX)) + stop("error in coding of principal components.") + + if(any(nPCADims > NCOL(prc))) + stop("error in coding of principal components.") + + if(!listReturn){ #nPCADims length 1; just return single matrix + if(pctReturn) { + nPCADims <- which(cumsum(prvar)>nPCADims)[1] #pick first pca coordinate with variance > value + xRet <- prc[seq_len(nPCADims),] + } else { + xRet <- prc } - doVarReduce<-function(name){ - fun<-switch(name,"var"=stats::var,"mad"=stats::mad,"cv"=function(x){stats::sd(x)/mean(x)}) - - if(name %in% dimReduce){ - if(max(nVarDims)>NROW(xCL)) stop("the number of most variable features must be strictly less than the number of rows of input data matrix") - if(min(nVarDims)<1) stop("the number of most variable features must be equal to 1 or greater") - if(min(nVarDims)<50 & NROW(xCL)>1000) warning("the number of most variable features to be selected is less than 50. Are you sure you meant to choose to use the top most variable features rather than PCA dimensionality reduction?") - varX<-apply(xCL,1,fun) - ord<-order(varX,decreasing=TRUE) - xVarOrdered<-x[ord,] - if(NCOL(xVarOrdered)!=NCOL(origX)) stop("error in coding of most variable.") - if(!listReturn){ #just return single matrix - xRet<-xVarOrdered[1:nVarDims,] - whFeatures<-ord[1:nVarDims] - return(list(x=xRet,whFeatures=whFeatures)) - } - else{ #otherwise make it a list - xLIST<-lapply(nVarDims,function(nn){xVarOrdered[1:nn,]}) - listName<-paste("n",toupper(name),"Features=",sep="") - names(xLIST)<-paste(listName,nVarDims,sep="") - return(xLIST) - } - } - else return(NULL) + } else{ + if(pctReturn){ + whPct <- which(nPCADims<1) + pctNDims <- sapply(nPCADims[whPct], function(pct){ + val<-which(cumsum(prvar)>pct)[1] #pick first pca coordinate with variance > value + if(length(val)==0) val<-length(prvar) #in case some numerical problem + return(val) + }) + if(any(is.na(pctNDims))) browser() + nPCADims[whPct]<-pctNDims } - if(any(dimReduce %in% varValues)){ - dimReduceVar<-dimReduce[dimReduce %in% varValues] - # browser() - if(!listReturn & length(dimReduceVar)==1){ - out<-doVarReduce(dimReduce) - xRet<-out$x - whFeatures<-out$whFeatures - } - else{ - varOut<-lapply(dimReduceVar,doVarReduce) - xVAR<-unlist(varOut,recursive=FALSE) - } + xPCA <- lapply(nPCADims,function(nn){prc[seq_len(nn),]}) + names(xPCA)<-paste("nPCAFeatures=",nPCADims,sep="") + } + } + + ################## + #Feature variability dim reduction + ################## + #for each dim reduction method requested + capwords <- function(s, strict = FALSE) { #From help of tolower + cap <- function(s) paste(toupper(substring(s, 1, 1)), + {s <- substring(s, 2); if(strict) tolower(s) else s}, + sep = "", collapse = " " ) + sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s))) + } + doVarReduce<-function(name){ + fun<-switch(name,"var"=stats::var,"mad"=stats::mad,"cv"=function(x){stats::sd(x)/mean(x)}) + + if(name %in% dimReduce){ + if(max(nVarDims)>NROW(xCL)) stop("the number of most variable features must be strictly less than the number of rows of input data matrix") + if(min(nVarDims)<1) stop("the number of most variable features must be equal to 1 or greater") + if(min(nVarDims)<50 & NROW(xCL)>1000) warning("the number of most variable features to be selected is less than 50. Are you sure you meant to choose to use the top most variable features rather than PCA dimensionality reduction?") + varX<-apply(xCL,1,fun) + ord<-order(varX,decreasing=TRUE) + xVarOrdered<-x[ord,] + if(NCOL(xVarOrdered)!=NCOL(origX)) stop("error in coding of most variable.") + if(!listReturn){ #just return single matrix + xRet<-xVarOrdered[1:nVarDims,] + whFeatures<-ord[1:nVarDims] + return(list(x=xRet,whFeatures=whFeatures)) } - # if("var" %in% dimReduce & all(!is.na(nVarDims))){ #do PCA dim reduction - # if(max(nVarDims)>NROW(x)) stop("the number of most variable features must be strictly less than the number of rows of input data matrix") - # if(min(nVarDims)<1) stop("the number of most variable features must be equal to 1 or greater") - # if(min(nVarDims)<50 & NROW(x)>1000) warning("the number of most variable features to be selected is less than 50. Are you sure you meant to choose to use the top most variable features rather than PCA dimensionality reduction?") - # varX<-apply(x,1,mad) - # ord<-order(varX,decreasing=TRUE) - # xVarOrdered<-x[ord,] - # if(NCOL(xVarOrdered)!=NCOL(origX)) stop("error in coding of principle components.") - # if(length(nVarDims)==1 & length(dimReduce)==1){ #just return single matrix - # x<-xVarOrdered[1:nVarDims,] - # whFeatures<-ord[1:nVarDims] - # - # } - # else{ #otherwise make it a list - # xVAR<-lapply(nVarDims,function(nn){xVarOrdered[1:nn,]}) - # names(xVAR)<-paste("nVarFeatures=",nVarDims,sep="") - # listReturn<-TRUE - # } - # } - if("none" %in% dimReduce){ - if(listReturn) xNone<-list("noDimReduce"=x) - else xRet<-x + else{ #otherwise make it a list + xLIST<-lapply(nVarDims,function(nn){xVarOrdered[1:nn,]}) + listName<-paste("n",toupper(name),"Features=",sep="") + names(xLIST)<-paste(listName,nVarDims,sep="") + return(xLIST) } - + } + else return(NULL) } - else{ - listReturn<-FALSE - whFeatures<-NULL - xRet<-x - + if(any(dimReduce %in% varValues)){ + dimReduceVar<-dimReduce[dimReduce %in% varValues] + # browser() + if(!listReturn & length(dimReduceVar)==1){ + out<-doVarReduce(dimReduce) + xRet<-out$x + whFeatures<-out$whFeatures + } + else{ + varOut<-lapply(dimReduceVar,doVarReduce) + xVAR<-unlist(varOut,recursive=FALSE) + } } - #browser() - - if(listReturn) xRet<-c(xNone,xVAR,xPCA) - return(list(x=xRet,transFun=transFun,whMostVar=whFeatures)) + # if("var" %in% dimReduce & all(!is.na(nVarDims))){ #do PCA dim reduction + # if(max(nVarDims)>NROW(x)) stop("the number of most variable features must be strictly less than the number of rows of input data matrix") + # if(min(nVarDims)<1) stop("the number of most variable features must be equal to 1 or greater") + # if(min(nVarDims)<50 & NROW(x)>1000) warning("the number of most variable features to be selected is less than 50. Are you sure you meant to choose to use the top most variable features rather than PCA dimensionality reduction?") + # varX<-apply(x,1,mad) + # ord<-order(varX,decreasing=TRUE) + # xVarOrdered<-x[ord,] + # if(NCOL(xVarOrdered)!=NCOL(origX)) stop("error in coding of principle components.") + # if(length(nVarDims)==1 & length(dimReduce)==1){ #just return single matrix + # x<-xVarOrdered[1:nVarDims,] + # whFeatures<-ord[1:nVarDims] + # + # } + # else{ #otherwise make it a list + # xVAR<-lapply(nVarDims,function(nn){xVarOrdered[1:nn,]}) + # names(xVAR)<-paste("nVarFeatures=",nVarDims,sep="") + # listReturn<-TRUE + # } + # } + if("none" %in% dimReduce){ + if(listReturn) xNone<-list("noDimReduce"=x) + else xRet<-x + } + + } + else{ + listReturn<-FALSE + whFeatures<-NULL + xRet<-x + + } + + + if(listReturn) xRet<-c(xNone,xVAR,xPCA) + return(list(x=xRet,transFun=transFun,whMostVar=whFeatures)) +} + +.pca <- function(x, center=TRUE, scale=FALSE, k) { + svd_raw <- svds(scale(x, center=center, scale=scale), k=k, nu=k, nv=0) + pc_raw <- svd_raw$u %*% diag(svd_raw$d, nrow = length(svd_raw$d)) + rownames(pc_raw) <- rownames(x) + return(pc_raw) } diff --git a/README.md b/README.md index 753eda7d..b5b427d4 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,15 @@ While we generally try to keep the bioconductor devel version up-to-date with th library(devtools) install_github("epurdom/clusterExperiment") ``` + +## Development branch: + +The `develop` branch is our development branch where we are actively updating features, and may contain bugs. You should not use the `develop` branch unless it passes TravisCI checks and you want to be using a *very* beta version. + ## Status +Below are the status checks. Note that occassionally errors do not appear here immediately. Clicking on the link will give you the most up-to-date status. + | Resource: | Travis CI | | ------------- | ------------ | | R CMD check master | [![Build Status](https://travis-ci.org/epurdom/clusterExperiment.svg?branch=master)](https://travis-ci.org/epurdom/clusterExperiment)| diff --git a/man/ClusterExperiment-class.Rd b/man/ClusterExperiment-class.Rd index cdfde1e8..d225310d 100644 --- a/man/ClusterExperiment-class.Rd +++ b/man/ClusterExperiment-class.Rd @@ -26,7 +26,8 @@ clusterExperiment(se, clusters, ...) \S4method{clusterExperiment}{SummarizedExperiment,matrix}(se, clusters, transformation, primaryIndex = 1, clusterTypes = "User", clusterInfo = NULL, orderSamples = 1:ncol(se), dendro_samples = NULL, - dendro_index = NA_real_, dendro_clusters = NULL, coClustering = NULL) + dendro_index = NA_real_, dendro_clusters = NULL, dendro_outbranch = NA, + coClustering = NULL) } \arguments{ \item{se}{a matrix or \code{SummarizedExperiment} containing the data to be @@ -62,6 +63,8 @@ Slots).} \item{dendro_clusters}{dendrogram. Sets the `dendro_clusters` slot (see Slots).} +\item{dendro_outbranch}{logical. Sets the `dendro_outbranch` slot (see Slots)} + \item{coClustering}{matrix. Sets the `coClustering` slot (see Slots).} } \value{ @@ -137,6 +140,9 @@ details).} \item{\code{dendro_index}}{numeric. An integer giving the cluster that was used to make the dendrograms. NA_real_ value if no dendrograms are saved.} +\item{\code{dendro_outbranch}}{logical. Whether the dendro_samples dendrogram put +missing/non-clustered samples in an outbranch, or intermixed in the dendrogram.} + \item{\code{coClustering}}{matrix. A matrix with the cluster co-occurrence information; this can either be based on subsampling or on co-clustering across parameter sets (see \code{clusterMany}). The matrix is a square matrix diff --git a/man/ClusterExperiment-methods.Rd b/man/ClusterExperiment-methods.Rd index 0d56e984..ec203021 100644 --- a/man/ClusterExperiment-methods.Rd +++ b/man/ClusterExperiment-methods.Rd @@ -14,6 +14,8 @@ \alias{primaryClusterNamed} \alias{transformation,ClusterExperiment-method} \alias{transformation} +\alias{transformation<-,ClusterExperiment,function-method} +\alias{transformation<-} \alias{nClusters,ClusterExperiment-method} \alias{nClusters} \alias{nFeatures,ClusterExperiment-method} @@ -28,6 +30,8 @@ \alias{primaryCluster} \alias{primaryClusterIndex,ClusterExperiment-method} \alias{primaryClusterIndex} +\alias{dendroClusterIndex,ClusterExperiment-method} +\alias{dendroClusterIndex} \alias{primaryClusterIndex<-,ClusterExperiment,numeric-method} \alias{primaryClusterIndex<-} \alias{coClustering,ClusterExperiment-method} @@ -68,6 +72,8 @@ \S4method{transformation}{ClusterExperiment}(x) +\S4method{transformation}{ClusterExperiment,`function`}(object) <- value + \S4method{nClusters}{ClusterExperiment}(x) \S4method{nFeatures}{ClusterExperiment}(x) @@ -84,6 +90,8 @@ \S4method{primaryClusterIndex}{ClusterExperiment}(x) +\S4method{dendroClusterIndex}{ClusterExperiment}(x) + \S4method{primaryClusterIndex}{ClusterExperiment,numeric}(object) <- value \S4method{coClustering}{ClusterExperiment}(x) @@ -114,6 +122,10 @@ \item{..., i, j, drop}{Forwarded to the \code{\link[SummarizedExperiment]{SummarizedExperiment}} method.} +\item{value}{The value to be substituted in the corresponding slot. See the +slot descriptions in \code{\link{ClusterExperiment}} for details on what +objects may be passed to these functions.} + \item{whichClusters}{optional argument that can be either numeric or character value. If numeric, gives the indices of the \code{clusterMatrix} to return; this can also be used to defined an ordering for the @@ -123,10 +135,6 @@ clusterings. \code{whichClusters} can be a character value identifying the 'all' or 'workflow' to indicate choosing all clusters or choosing all \code{\link{workflowClusters}}. If missing, the entire matrix of all clusterings is returned.} - -\item{value}{The value to be substituted in the corresponding slot. See the -slot descriptions in \code{\link{ClusterExperiment}} for details on what -objects may be passed to these functions.} } \value{ \code{clusterMatrixNamed} returns a matrix with cluster labels. @@ -155,6 +163,10 @@ clusterMatrix). \code{primaryClusterIndex} returns/sets the primary clustering index (i.e., which column of clusterMatrix corresponds to the primary clustering). +\code{dendroClusterIndex} returns/sets the clustering index +of the clusters used to create dendrogram +(i.e., which column of clusterMatrix corresponds to the clustering). + \code{coClustering} returns/sets the co-clustering matrix. \code{clusterTypes} returns/sets the clusterTypes slot. diff --git a/man/RSEC.Rd b/man/RSEC.Rd index 04c54083..af52cbd5 100644 --- a/man/RSEC.Rd +++ b/man/RSEC.Rd @@ -7,6 +7,7 @@ \alias{RSEC,ClusterExperiment-method} \alias{RSEC,matrix-method} \alias{RSEC,SummarizedExperiment-method} +\alias{RSEC,data.frame-method} \alias{RSEC,ClusterExperiment-method} \title{Resampling-based Sequential Ensemble Clustering} \usage{ @@ -21,6 +22,8 @@ \S4method{RSEC}{SummarizedExperiment}(x, ...) +\S4method{RSEC}{data.frame}(x, ...) + \S4method{RSEC}{ClusterExperiment}(x, eraseOld = FALSE, rerunClusterMany = FALSE, ...) } @@ -42,7 +45,7 @@ dimensionality reduction to perform before clustering. Options are "none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more details.} -\item{nVarDims}{vector of the number of the most variable features to keep +\item{nVarDims}{vector of the number of the most variable features to keep (when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is included, then the full dataset will also be included.} @@ -52,14 +55,14 @@ included.} \item{k0s}{the k0 parameter for sequential clustering (see \code{\link{seqCluster}})} -\item{clusterFunction}{function used for the clustering. Note that unlike in +\item{clusterFunction}{function used for the clustering. Note that unlike in \code{\link{clusterSingle}}, this must be a character vector of pre-defined clustering techniques provided by \code{\link{clusterSingle}}, and can not be a user-defined function. Current functions are "tight", "hierarchical01","hierarchicalK", and "pam"} -\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of -type '01' (either 'tight' or 'hierarchical01'). Determines tightness +\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of +type '01' (either 'tight' or 'hierarchical01'). Determines tightness required in creating clusters from the dissimilarity matrix. Takes on values in [0,1]. See \code{\link{clusterD}}.} @@ -97,7 +100,7 @@ left unassigned.} \item{ncores}{the number of threads} -\item{random.seed}{a value to set seed before each run of clusterSingle (so +\item{random.seed}{a value to set seed before each run of clusterSingle (so that all of the runs are run on the same subsample of the data). Note, if 'random.seed' is set, argument 'ncores' should NOT be passed via subsampleArgs; instead set the argument 'ncores' of diff --git a/man/clusterMany.Rd b/man/clusterMany.Rd index eedd33cc..a93f53b0 100644 --- a/man/clusterMany.Rd +++ b/man/clusterMany.Rd @@ -7,6 +7,7 @@ \alias{clusterMany,list-method} \alias{clusterMany,ClusterExperiment-method} \alias{clusterMany,SummarizedExperiment-method} +\alias{clusterMany,data.frame-method} \title{Create a matrix of clustering across values of parameters} \usage{ \S4method{clusterMany}{matrix}(x, dimReduce = "none", nVarDims = NA, @@ -24,6 +25,8 @@ \S4method{clusterMany}{SummarizedExperiment}(x, dimReduce = "none", nVarDims = NA, nPCADims = NA, transFun = NULL, isCount = FALSE, ...) + +\S4method{clusterMany}{data.frame}(x, ...) } \arguments{ \item{x}{the data on which to run the clustering. Can be: matrix (with genes @@ -35,7 +38,7 @@ dimensionality reduction to perform before clustering. Options are "none","PCA", "var","cv", and "mad". See \code{\link{transform}} for more details.} -\item{nVarDims}{vector of the number of the most variable features to keep +\item{nVarDims}{vector of the number of the most variable features to keep (when "var", "cv", or "mad" is identified in \code{dimReduce}). If NA is included, then the full dataset will also be included.} @@ -58,14 +61,14 @@ method for signature \code{list}.} \item{ks}{the range of k values (see details for meaning for different choices).} -\item{clusterFunction}{function used for the clustering. Note that unlike in +\item{clusterFunction}{function used for the clustering. Note that unlike in \code{\link{clusterSingle}}, this must be a character vector of pre-defined clustering techniques provided by \code{\link{clusterSingle}}, and can not be a user-defined function. Current functions are "tight", "hierarchical01","hierarchicalK", and "pam"} -\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of -type '01' (either 'tight' or 'hierarchical01'). Determines tightness +\item{alphas}{values of alpha to be tried. Only used for clusterFunctions of +type '01' (either 'tight' or 'hierarchical01'). Determines tightness required in creating clusters from the dissimilarity matrix. Takes on values in [0,1]. See \code{\link{clusterD}}.} @@ -86,10 +89,10 @@ iteration; otherwise the distance function will be determined by argument \item{silCutoff}{Requirement on minimum silhouette width to be included in cluster (only if removeSil=TRUE).} -\item{distFunction}{a vector of character strings that are the names of +\item{distFunction}{a vector of character strings that are the names of distance functions found in the global environment. See the help pages of -\code{\link{clusterD}} for details about the required format of distance -functions. Currently, this distance function must be applicable for all +\code{\link{clusterD}} for details about the required format of distance +functions. Currently, this distance function must be applicable for all clusterFunction types tried. Therefore, it is not possible to intermix type "K" and type "01" algorithms if you also give distances to evaluate via \code{distFunction} unless all distances give 0-1 values for the distance @@ -117,7 +120,7 @@ left unassigned.} \item{ncores}{the number of threads} -\item{random.seed}{a value to set seed before each run of clusterSingle (so +\item{random.seed}{a value to set seed before each run of clusterSingle (so that all of the runs are run on the same subsample of the data). Note, if 'random.seed' is set, argument 'ncores' should NOT be passed via subsampleArgs; instead set the argument 'ncores' of diff --git a/man/makeDendrogram.Rd b/man/makeDendrogram.Rd index 75265733..55a86aec 100644 --- a/man/makeDendrogram.Rd +++ b/man/makeDendrogram.Rd @@ -1,12 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/makeDendrogram.R \docType{methods} -\name{makeDendrogram,ClusterExperiment-method} -\alias{makeDendrogram,ClusterExperiment-method} +\name{makeDendrogram} \alias{makeDendrogram} +\alias{makeDendrogram,ClusterExperiment-method} \alias{makeDendrogram,matrix-method} -\alias{plotDendrogram,ClusterExperiment-method} -\alias{plotDendrogram} \title{Make hierarchy of set of clusters} \usage{ \S4method{makeDendrogram}{ClusterExperiment}(x, @@ -16,16 +14,13 @@ \S4method{makeDendrogram}{matrix}(x, cluster, unassignedSamples = c("outgroup", "cluster", "remove"), ...) - -\S4method{plotDendrogram}{ClusterExperiment}(x, leaves = c("clusters", - "samples"), clusterNames = TRUE, main, sub, ...) } \arguments{ -\item{x}{data to define the medoids from. Matrix and +\item{x}{data to define the medoids from. Matrix and \code{\link{ClusterExperiment}} supported.} -\item{whichCluster}{an integer index or character string that identifies -which cluster should be used to make the dendrogram. Default is +\item{whichCluster}{an integer index or character string that identifies +which cluster should be used to make the dendrogram. Default is primaryCluster.} \item{dimReduce}{character A character identifying what type of @@ -36,32 +31,22 @@ details.} \item{ndims}{integer An integer identifying how many dimensions to reduce to in the reduction specified by \code{dimReduce}} -\item{ignoreUnassignedVar}{logical indicating whether dimensionality reduction -via top feature variability (i.e. 'var','cv','mad') should ignore -unassigned samples in the primary clustering for calculation of the top +\item{ignoreUnassignedVar}{logical indicating whether dimensionality reduction +via top feature variability (i.e. 'var','cv','mad') should ignore +unassigned samples in the primary clustering for calculation of the top features.} -\item{unassignedSamples}{how to handle unassigned samples("-1") ; only relevant -for sample clustering. See details.} +\item{unassignedSamples}{how to handle unassigned samples("-1") ; only +relevant for sample clustering. See details.} -\item{...}{for makeDendrogram, if signature \code{matrix}, arguments passed -to hclust; if signature \code{ClusterExperiment} passed to the method for -signature \code{matrix}. For plotDendrogram, passed to \code{\link{plot.dendrogram}}.} +\item{...}{for makeDendrogram, if signature \code{matrix}, arguments passed +to hclust; if signature \code{ClusterExperiment} passed to the method for +signature \code{matrix}. For plotDendrogram, passed to +\code{\link{plot.dendrogram}}.} -\item{cluster}{A numeric vector with cluster assignments. If x is a -ClusterExperiment object, cluster is automatically the primaryCluster(x). +\item{cluster}{A numeric vector with cluster assignments. If x is a +ClusterExperiment object, cluster is automatically the primaryCluster(x). ``-1'' indicates the sample was not assigned to a cluster.} - -\item{leaves}{if "samples" the dendrogram has one leaf per sample, otherwise -it has one per cluster.} - -\item{clusterNames}{logical. If \code{leaves="clusters"}, then clusters will -be identified with their 'name' value in legend; otherwise the 'clusterIds' -value will be used.} - -\item{main}{passed to the \code{plot} function.} - -\item{sub}{passed to the \code{plot} function.} } \value{ If x is a matrix, a list with two dendrograms, one in which the @@ -69,8 +54,8 @@ leaves are clusters and one in which the leaves are samples. If x is a ClusterExperiment object, the dendrograms are saved in the appropriate slots. } \description{ -Makes a dendrogram of a set of clusters based on hclust on the -medoids of the cluster. +Makes a dendrogram of a set of clusters based on hclust on the + medoids of the cluster. } \details{ The function returns two dendrograms (as a list if x is a matrix or @@ -90,11 +75,6 @@ that samples with "-1" should be discarded. This is not a permitted option, however, when \code{x} is a \code{ClusterExperiment} object, because it would return a dendrogram with less samples than \code{NCOL(x)}, which is not permitted for the \code{@dendro_samples} slot. - -If \code{leaves="clusters"}, the plotting function will work best if - the clusters in the dendrogram correspond to the primary cluster. This is - because the function colors the cluster labels based on the colors of the - clusterIds of the primaryCluster } \examples{ data(simData) @@ -106,6 +86,6 @@ sequential=FALSE, clusterDArgs=list(k=8)) #create dendrogram of clusters: hcl <- makeDendrogram(cl) plotDendrogram(hcl) -plotDendrogram(hcl, leaves="samples") +plotDendrogram(hcl, leafType="samples",labelType="colorblock") } diff --git a/man/mergeClusters.Rd b/man/mergeClusters.Rd index 65e89d0b..d50317cd 100644 --- a/man/mergeClusters.Rd +++ b/man/mergeClusters.Rd @@ -9,49 +9,56 @@ \usage{ \S4method{mergeClusters}{matrix}(x, cl, dendro = NULL, mergeMethod = c("none", "adjP", "locfdr", "MB", "JC"), - plotType = c("none", "all", "mergeMethod", "adjP", "locfdr", "MB", "JC"), - cutoff = 0.1, doPlot = TRUE, isCount = TRUE, ...) + plotInfo = c("none", "all", "mergeMethod", "adjP", "locfdr", "MB", "JC"), + cutoff = 0.1, plot = TRUE, isCount = TRUE, ...) \S4method{mergeClusters}{ClusterExperiment}(x, eraseOld = FALSE, - isCount = FALSE, mergeMethod = "none", plotType = "all", - clusterLabel = "mergeClusters", ...) + isCount = FALSE, mergeMethod = "none", plotInfo = "all", + clusterLabel = "mergeClusters", leafType = c("samples", "clusters"), + labelType = c("colorblock", "name", "ids"), plot = TRUE, ...) } \arguments{ -\item{x}{data to perform the test on. It can be a matrix or a +\item{x}{data to perform the test on. It can be a matrix or a \code{\link{ClusterExperiment}}.} -\item{cl}{A numeric vector with cluster assignments to compare to. ``-1'' +\item{cl}{A numeric vector with cluster assignments to compare to. ``-1'' indicates the sample was not assigned to a cluster.} -\item{dendro}{dendrogram providing hierarchical clustering of clusters in cl; -The default for matrix (NULL) is to recalculate it with the given (x, cl) -pair. If x is a \code{\link{ClusterExperiment}} object, the dendrogram in -the slot \code{dendro_clusters} will be used. This means that -\code{\link{makeDendrogram}} needs to be called before -\code{mergeClusters}.} +\item{dendro}{dendrogram providing hierarchical clustering of clusters in cl. +If x is a matrix, then the default is \code{dendro=NULL} and the function +will calculate the dendrogram with the given (x, cl) pair using +\code{\link{makeDendrogram}}. If x is a \code{\link{ClusterExperiment}} +object, the dendrogram in the slot \code{dendro_clusters} will be used. In +this case, this means that \code{\link{makeDendrogram}} needs to be called +before \code{mergeClusters}.} \item{mergeMethod}{method for calculating proportion of non-null that will be -used to merge clusters (if 'none', no merging will be done). See details +used to merge clusters (if 'none', no merging will be done). See details for description of methods.} -\item{plotType}{what type of plotting of dendrogram. If 'all', then all the -estimates of proportion non-null will be plotted; if 'mergeMethod', then -only the value used in the merging is plotted for each node.} +\item{plotInfo}{what type of information about the merging will be shown on +the dendrogram. If 'all', then all the estimates of proportion non-null +will be plotted at each node of the dendrogram; if 'mergeMethod', then only +the value used in the merging is plotted at each node. If 'none', then no +proportions will be added to the dendrogram. 'plotInfo' can also be one of +the mergeMethod choices (even if that method is not the method chosen in +'mergeMethod' options).} -\item{cutoff}{minimimum value required for NOT merging a cluster, i.e. -two clusters with the proportion of DE below cutoff will be merged. -Must be a value between 0, 1, where -lower values will make it harder to merge clusters.} +\item{cutoff}{minimimum value required for NOT merging a cluster, i.e. two +clusters with the proportion of DE below cutoff will be merged. Must be a +value between 0, 1, where lower values will make it harder to merge +clusters.} -\item{doPlot}{logical as to whether to plot the dendrogram (overrides -\code{plotType} value). Mainly used for internal coding purposes.} +\item{plot}{logical as to whether to plot the dendrogram with the merge +results} -\item{isCount}{logical as to whether input data is a count matrix. See details.} +\item{isCount}{logical as to whether input data is a count matrix. See +details.} -\item{...}{for signature \code{matrix}, arguments passed to the -\code{\link{plot.phylo}} function of \code{ade4} that plots the dendrogram. -For signature \code{ClusterExperiment} arguments passed to the method for -signature \code{matrix}.} +\item{...}{for signature \code{matrix}, arguments passed to the +\code{\link{plot.phylo}} function of \code{ape} that plots the dendrogram. +For signature \code{ClusterExperiment} arguments passed to the method for +signature \code{matrix} and then onto \code{\link{plot.phylo}}.} \item{eraseOld}{logical. Only relevant if input \code{x} is of class \code{ClusterExperiment}. If TRUE, will erase existing workflow results @@ -60,53 +67,73 @@ workflow results will have "\code{_i}" added to the clusterTypes value, where \code{i} is one more than the largest such existing workflow clusterTypes.} -\item{clusterLabel}{a string used to describe the type of clustering. By +\item{clusterLabel}{a string used to describe the type of clustering. By default it is equal to "mergeClusters", to indicate that this clustering is -the result of a call to mergeClusters.} +the result of a call to mergeClusters (only if x is a ClusterExperiment object)} + +\item{leafType}{if plotting, whether the leaves should be the clusters or the +samples. Choosing 'samples' allows for visualization of how many samples +are in the merged clusters (only if x is a ClusterExperiment object), which +is the main difference between choosing "clusters" and "samples", +particularly if \code{labelType="colorblock"}} + +\item{labelType}{if plotting, then whether leaves of dendrogram should be +labeled by rectangular blocks of color ("colorblock") or with the names of +the leaves ("name") (only if x is a ClusterExperiment object).} } \value{ -If `x` is a matrix, it returns (invisibly) a list with elements - \itemize{ \item{\code{clustering}}{ a vector of length equal to ncol(x) - giving the integer-valued cluster ids for each sample. "-1" indicates the - sample was not clustered.} \item{\code{oldClToNew}}{ A table of the old +If `x` is a matrix, it returns (invisibly) a list with elements + \itemize{ \item{\code{clustering}}{ a vector of length equal to ncol(x) + giving the integer-valued cluster ids for each sample. "-1" indicates the + sample was not clustered.} \item{\code{oldClToNew}}{ A table of the old cluster labels to the new cluster labels.} \item{\code{propDE}}{ A table of - the proportions that are DE on each node.} - \item{\code{originalClusterDendro}}{ The dendrogram on which the merging + the proportions that are DE on each node.} + \item{\code{originalClusterDendro}}{ The dendrogram on which the merging was based (based on the original clustering).} } -If `x` is a \code{\link{ClusterExperiment}}, it returns a new - \code{ClusterExperiment} object with an additional clustering based on the +If `x` is a \code{\link{ClusterExperiment}}, it returns a new + \code{ClusterExperiment} object with an additional clustering based on the merging. This becomes the new primary clustering. } \description{ -Takes an input of hierarchical clusterings of clusters and - returns estimates of number of proportion of non-null and merges those +Takes an input of hierarchical clusterings of clusters and + returns estimates of number of proportion of non-null and merges those below a certain cutoff. } \details{ -If \code{isCount=TRUE}, and the input is a matrix, - \code{log2(count + 1)} will be used for \code{\link{makeDendrogram}} and the - original data with voom correction will be used in - \code{\link{getBestFeatures}}). If input is - \code{\link{ClusterExperiment}}, then setting \code{isCount=TRUE} also means - that the log2(1+count) will be used as the transformation, like for - the matrix case as well as the voom calculation, and will NOT use the - transformation stored in the object. If FALSE, then transform(x) will be - given to the input and will be used for both \code{makeDendrogram} and +If \code{isCount=TRUE}, and the input is a matrix, \code{log2(count + + 1)} will be used for \code{\link{makeDendrogram}} and the original data + with voom correction will be used in \code{\link{getBestFeatures}}). If + input is \code{\link{ClusterExperiment}}, then setting \code{isCount=TRUE} + also means that the log2(1+count) will be used as the transformation, like + for the matrix case as well as the voom calculation, and will NOT use the + transformation stored in the object. If FALSE, then transform(x) will be + given to the input and will be used for both \code{makeDendrogram} and \code{getBestFeatures}, with no voom correction. -"JC" refers to the method of Ji and Cai (2007), and implementation - of "JC" method is copied from code available on Jiashin Ji's website, - December 16, 2015 +"JC" refers to the method of Ji and Cai (2007), and implementation + of "JC" method is copied from code available on Jiashin Ji's website, + December 16, 2015 (http://www.stat.cmu.edu/~jiashun/Research/software/NullandProp/). "locfdr" - refers to the method of Efron (2004) and is implemented in the package + refers to the method of Efron (2004) and is implemented in the package \code{\link{locfdr}}. "MB" refers to the method of Meinshausen and Buhlmann - (2005) and is implemented in the package \code{\link{howmany}}. "adjP" + (2005) and is implemented in the package \code{\link{howmany}}. "adjP" refers to the proportion of genes that are found significant based on a FDR adjusted p-values (method "BH") and a cutoff of 0.05. -If \code{mergeMethod} is not equal to 'none' then the plotting will - indicate where the clusters will be merged (assuming \code{plotType} is not 'none'). +If \code{mergeMethod} is not equal to 'none' then the plotting will + indicate where the clusters will be merged (assuming \code{plotInfo} is not + 'none'). Note setting both 'mergeMethod' and 'plotInfo' to 'none' will + cause function to stop, because nothing is asked to be done. If you just + want plot of the dendrogram, with no merging performed or demonstrated on + the plot, see \code{\link{plotDendrogram}}. + +If the dendrogram was made with option + \code{unassignedSamples="cluster"} (i.e. unassigned were clustered in with + other samples), then you cannot choose the option + \code{leafType='samples'}. This is because the current code cannot reliably + link up the internal nodes of the sample dendrogram to the internal nodes + of the cluster dendrogram when the unassigned samples are intermixed. } \examples{ data(simData) @@ -115,14 +142,27 @@ data(simData) cl<-clusterSingle(simData, clusterFunction="pam", subsample=FALSE, sequential=FALSE, clusterDArgs=list(k=8)) +#give more interesting names to clusters: +newNames<- paste("Cluster",clusterLegend(cl)[[1]][,"name"],sep="") +clusterLegend(cl)[[1]][,"name"]<-newNames #make dendrogram cl <- makeDendrogram(cl) -#merge clusters with plotting. Note argument 'use.edge.length' can improve -#readability -merged <- mergeClusters(cl, plot=TRUE, plotType="all", +#plot showing the before and after clustering +#(Note argument 'use.edge.length' can improve +#readability) +merged <- mergeClusters(cl, plotInfo="all", mergeMethod="adjP", use.edge.length=FALSE) +#Simpler plot with just dendrogram and single method +merged <- mergeClusters(cl, plotInfo="mergeMethod", +mergeMethod="adjP", use.edge.length=FALSE, +leafType="clusters",label="name") + #compare merged to original table(primaryCluster(cl), primaryCluster(merged)) + +} +\seealso{ +makeDendrogram, plotDendrogram, getBestFeatures } diff --git a/man/plotDendrogram.Rd b/man/plotDendrogram.Rd new file mode 100644 index 00000000..edab8b0e --- /dev/null +++ b/man/plotDendrogram.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotDendrogram.R +\docType{methods} +\name{plotDendrogram,ClusterExperiment-method} +\alias{plotDendrogram,ClusterExperiment-method} +\alias{plotDendrogram} +\title{Plot dendrogram of clusterExperiment object} +\usage{ +\S4method{plotDendrogram}{ClusterExperiment}(x, whichClusters = "dendro", + leafType = c("clusters", "samples"), labelType = c("name", "colorblock", + "ids"), main, sub, removeOutbranch = TRUE, legend = "side", ...) +} +\arguments{ +\item{x}{a \code{\link{ClusterExperiment}} object.} + +\item{whichClusters}{only used if \code{leafType="samples"}). If numeric, an +index for the clusterings to be plotted with dendrogram. Otherwise, +\code{whichClusters} can be a character value identifying the +\code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +\code{clusterLabels}; alternatively \code{whichClusters} can be either +'all' or 'workflow' or 'primaryCluster' to indicate choosing all clusters +or choosing all \code{\link{workflowClusters}}. Default 'dendro' indicates +using the clustering that created the dendrogram.} + +\item{leafType}{if "samples" the dendrogram has one leaf per sample, +otherwise it has one per cluster.} + +\item{labelType}{one of 'name', 'colorblock' or 'id'. If 'Name' then +dendrogram will be plotted, and name of cluster or sample (depending on +type of value for \code{leafType}) will be plotted next to the leaf of the +dendrogram. If 'colorblock', rectangular blocks, corresponding to the color +of the cluster will be plotted, along with cluster name legend. If 'id' the +internal clusterIds value will be plotted (only appropriate if +\code{leafType="clusters"}).} + +\item{main}{passed to the \code{plot.phylo} function to set main title.} + +\item{sub}{passed to the \code{plot.phylo} function to set subtitle.} + +\item{removeOutbranch}{logical, only applicable if there are missing samples +(i.e. equal to -1 or -2), \code{leafType="samples"} and the dendrogram +for the samples was made by putting missing samples in an outbranch. In +which case, if this parameter is TRUE, the outbranch will not be plotted, +and if FALSE it will be plotted.} + +\item{legend}{logical, only applicable if \code{labelType="colorblock"}. +Passed to \code{\link{phydataplot}} in \code{\link{ape}} package that is +used to draw the color values of the clusters/samples next to the +dendrogram. Options are 'none', 'below', or 'side'} + +\item{...}{arguments passed to the \code{\link{plot.phylo}} function of +\code{ape} that plots the dendrogram.} +} +\description{ +Plots the dendrogram saved in a clusterExperiment object +} +\details{ +If \code{leafType="clusters"}, the plotting function will work best + if the clusters in the dendrogram correspond to the primary cluster. This + is because the function colors the cluster labels based on the colors of + the clusterIds of the primaryCluster +} +\examples{ +data(simData) + +#create a clustering, for 8 clusters (truth was 3) +cl <-clusterSingle(simData, clusterFunction="pam", subsample=FALSE, +sequential=FALSE, clusterDArgs=list(k=8)) + +#create dendrogram of clusters and then +# merge clusters based ondendrogram: +cl <- makeDendrogram(cl) +cl <- mergeClusters(cl,mergeMethod="adjP",cutoff=0.1,plot=FALSE) +plotDendrogram(cl) +plotDendrogram(cl,leafType="samples",whichClusters="all",labelType="colorblock") + +} diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd index de225d35..13178b33 100644 --- a/man/plotHeatmap.Rd +++ b/man/plotHeatmap.Rd @@ -42,7 +42,7 @@ \item{data}{data to use to determine the heatmap. Can be a matrix, \code{\link{ClusterExperiment}} or \code{\link[SummarizedExperiment]{SummarizedExperiment}} object. The -interpretation of parameters depends on the type of the input.} +interpretation of parameters depends on the type of the input to \code{data}.} \item{isCount}{logical. Whether the data are in counts, in which case the default \code{transFun} argument is set as log2(x+1). This is simply a @@ -74,21 +74,22 @@ should be used (see details).} \item{nFeatures}{integer indicating how many features should be used (if \code{clusterFeaturesData} is 'var' or 'PCA').} -\item{visualizeData}{either a character string, indicating what form of the -data should be used for visualizing the data (i.e. for making the -color-scale), or a data.frame/matrix with same dimensions of -\code{assay(data)}.} +\item{visualizeData}{either a character string, indicating what form of the +data should be used for visualizing the data (i.e. for making the +color-scale), or a data.frame/matrix with same number of samples as +\code{assay(data)}. If a new data.frame/matrix, any character arguments to +clusterFeaturesData will be ignored.} \item{whichClusters}{character string, or vector of characters or integers, indicating what clusters should be visualized with the heatmap.} -\item{sampleData}{If input is either a \code{\link{ClusterExperiment}} or +\item{sampleData}{If input to \code{data} is either a \code{\link{ClusterExperiment}} or \code{SummarizedExperiment} object, then \code{sampleData} must index the sampleData stored as a \code{DataFrame} in \code{colData} slot of the object. Whether that data is continuous or not will be determined by the -properties of \code{colData} (no user input is needed). If input is matrix, +properties of \code{colData} (no user input is needed). If input to \code{data} is matrix, \code{sampleData} is a matrix of additional data on the samples to show -above heatmap. Unless indicated by \code{whSampleDataCont}, +above heatmap. In this case, unless indicated by \code{whSampleDataCont}, \code{sampleData} will be converted into factors, even if numeric. ``-1'' indicates the sample was not assigned to a cluster and gets color `unassignedColor' and ``-2`` gets the color 'missingColor'.} @@ -100,7 +101,7 @@ features (if FALSE, any input to clusterFeaturesData is ignored).} \item{whSampleDataCont}{Which of the \code{sampleData} columns are continuous and should not be converted to counts. \code{NULL} indicates no additional -\code{sampleData}.} +\code{sampleData}. Only used if \code{data} input is matrix.} \item{clusterSamples}{Logical as to whether to do hierarchical clustering of cells (if FALSE, any input to clusterSamplesData is ignored).} @@ -241,12 +242,19 @@ If \code{breaks} is a numeric value between 0 and 1, then 0.9 quantile will be absorbed by the upper-most color bin. This can help to reduce the visual impact of a few highly expressed genes (features). -Note that plotHeatmap calls \code{\link[NMF]{aheatmap}} under the - hood. This allows you to plot multiple heatmaps via - \code{par(mfrow=c(2,2))}, etc. However, the dendrograms do not resize if - you change the size of your plot window in an interactive session of R +Note that plotHeatmap calls \code{\link[NMF]{aheatmap}} under the + hood. This allows you to plot multiple heatmaps via + \code{par(mfrow=c(2,2))}, etc. However, the dendrograms do not resize if + you change the size of your plot window in an interactive session of R (this might be a problem for RStudio if you want to pop it out into a large - window...). + window...). Also, plotting to a pdf adds a blank page; see help pages of + \code{\link[NMF]{aheatmap}} for how to turn this off. + +If you have a factor with many levels, it is important to note that + \code{\link[NMF]{aheatmap}} does not recycle colors across factors in the + \code{sampleData}, and in fact runs out of colors and the remaining levels + get the color white. Thus if you have many factors or many levels in those + factors, you should set their colors via \code{clusterLegend}. Many arguments can be passed on to aheatmap, however, some are set internally by \code{plotHeatmap.} In particular, setting the values of diff --git a/man/transform.Rd b/man/transform.Rd index 866b6130..87b943ea 100644 --- a/man/transform.Rd +++ b/man/transform.Rd @@ -6,14 +6,14 @@ \alias{transform,ClusterExperiment-method} \title{Transform the original data in a ClusterExperiment object} \usage{ -\S4method{transform}{ClusterExperiment}(x, nPCADims = NA, nVarDims = NA, - dimReduce = "none", ignoreUnassignedVar = FALSE) +\S4method{transform}{ClusterExperiment}(`_data`, nPCADims = NA, + nVarDims = NA, dimReduce = "none", ignoreUnassignedVar = FALSE) } \arguments{ -\item{x}{a ClusterExperiment object.} +\item{_data}{a ClusterExperiment object.} -\item{nPCADims}{Numeric vector giving the number of PC dimensions to use in -PCA dimensionality reduction. If NA no PCA dimensionality reduction is +\item{nPCADims}{Numeric vector giving the number of PC dimensions to use in +PCA dimensionality reduction. If NA no PCA dimensionality reduction is done. nPCADims can also take values between (0,1) to indicate keeping the number of PCA dimensions necessary to account for that proportion of the variance.} @@ -24,9 +24,9 @@ genes) to keep, based on variance/cv/mad variability.} \item{dimReduce}{Character vector specifying the dimensionality reduction to perform, any combination of 'none', 'PCA', 'var', 'cv', and 'mad'. See details.} -\item{ignoreUnassignedVar}{logical indicating whether dimensionality reduction -via top feature variability (i.e. 'var','cv','mad') should ignore -unassigned samples in the primary clustering for calculation of the top +\item{ignoreUnassignedVar}{logical indicating whether dimensionality reduction +via top feature variability (i.e. 'var','cv','mad') should ignore +unassigned samples in the primary clustering for calculation of the top features.} } \value{ @@ -41,24 +41,24 @@ Provides the transformed data (as defined by the object), as well as dimensionality reduction. } \details{ -The data matrix defined by \code{assay(x)} is transformed based on - the transformation function defined in x. If \code{dimReduce="none"} the - transformed matrix is returned. Otherwise, the user can request - dimensionality reduction of the transformed data via \code{dimReduce}. - 'PCA' refers to PCA of the transformed data with the top nPCADims kept. +The data matrix defined by \code{assay(x)} is transformed based on + the transformation function defined in x. If \code{dimReduce="none"} the + transformed matrix is returned. Otherwise, the user can request + dimensionality reduction of the transformed data via \code{dimReduce}. + 'PCA' refers to PCA of the transformed data with the top nPCADims kept. 'var', 'cv', and 'mad' refers to keeping the top most variable features, as - defined by taking the variance, the mad, or the coefficient of variation - (respectively) across all samples. nVarDims defines how many such features + defined by taking the variance, the mad, or the coefficient of variation + (respectively) across all samples. nVarDims defines how many such features to keep for any of 'var','cv', or 'mad'; note that the number of features must be the same for all of these options (they cannot be set separately). -The PCA uses prcomp on \code{t(assay(x))} with \code{center=TRUE} - and \code{scale=TRUE} (i.e. the feature are centered and scaled), so that +The PCA uses prcomp on \code{t(assay(x))} with \code{center=TRUE} + and \code{scale=TRUE} (i.e. the feature are centered and scaled), so that it is performing PCA on the correlation matrix of the features. \code{ignoreUnassignedVar} has no impact for PCA reduction, which will always use all samples. At all times, regardless of the value of - \code{ignoreUnassignedVar}, a matrix with the same number of columns of + \code{ignoreUnassignedVar}, a matrix with the same number of columns of \code{assay(x)} (i.e. the same number of samples) will be returned. \code{dimReduce}, \code{nPCADims}, \code{nVarDims} can all be a diff --git a/tests/testthat/test_RSEC.R b/tests/testthat/test_RSEC.R index cd53d116..35d28150 100644 --- a/tests/testthat/test_RSEC.R +++ b/tests/testthat/test_RSEC.R @@ -2,6 +2,7 @@ context("RSEC") source("create_objects.R") test_that("`RSEC` works with matrix, clusterExperiment, summarizedExperiment",{ ##these examples don't do dendrogram/merge because all -1 after combineMany + ##only tests clusterMany, combineMany parts. RSEC(x=mat, isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none", subsampleArgs=list(resamp.num=5),random.seed=495 ) @@ -15,9 +16,40 @@ test_that("`RSEC` works with matrix, clusterExperiment, summarizedExperiment",{ #test rerunClusterMany argument: RSEC(rsecOut,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",rerunClusterMany=TRUE,subsampleArgs=list(resamp.num=5),random.seed=495) RSEC(rsecOut,isCount=FALSE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none",rerunClusterMany=FALSE,subsampleArgs=list(resamp.num=5),random.seed=495) -#bigger example where actually goes through all the steps (above skips the merging, in particular, because no dendrogram): - RSEC(x=seSimCount, isCount=TRUE,dimReduce="none",k0s=4:5,clusterFunction="tight", alphas=0.1,dendroReduce="none", + }) + +test_that("`RSEC` works through whole series of steps",{ +#bigger example where actually goes through all the steps (above skips the merging, in particular, because no dendrogram); takes some time: +rsecOut<-RSEC(x=assay(seSimCount), isCount=TRUE,dimReduce="none", + k0s=4:5,clusterFunction="tight", alphas=0.1, + betas=0.9,dendroReduce="none",minSizes=1, subsampleArgs=list(resamp.num=5),random.seed=495 ) - + ##check same as individual steps + ceOut<-clusterMany(x=assay(seSimCount),ks=4:5,clusterFunction="tight",alphas=0.1,betas=0.9,minSizes=1, + isCount=TRUE, dimReduce="none", transFun = NULL, + sequential=TRUE,removeSil=FALSE,subsample=TRUE,silCutoff=0,distFunction=NA, + nVarDims=NA,nPCADims=NA, + clusterDArgs=NULL,subsampleArgs=list(resamp.num=5), + ncores=1,run=TRUE,seqArgs=list(verbose=FALSE),random.seed=495 + ) + expect_equal(clusterMatrix(rsecOut,whichClusters="clusterMany"),clusterMatrix(ceOut)) + + combOut<-combineMany(ceOut, proportion = 0.7,minSize = 5) + expect_equal(clusterMatrix(rsecOut,whichClusters="combineMany"),clusterMatrix(combOut,whichClusters="combineMany")) + expect_equal(coClustering(rsecOut),coClustering(combOut)) + + dendOut<-makeDendrogram(combOut,dimReduce="none",ndims=NA) + expect_equal(dendOut@dendro_clusters,rsecOut@dendro_clusters) + expect_equal(dendOut@dendro_outbranch,rsecOut@dendro_outbranch) + + #now should be the same, check all objects except dendro_samples because very big: + mergeOut<-mergeClusters(dendOut,mergeMethod = "adjP", cutoff = 0.05,isCount=TRUE) + expect_equal(dendroClusterIndex(mergeOut),dendroClusterIndex(rsecOut)) + expect_equal(mergeOut@dendro_clusters,rsecOut@dendro_clusters) + expect_equal(mergeOut@dendro_outbranch,rsecOut@dendro_outbranch) + expect_equal(coClustering(mergeOut),coClustering(rsecOut)) + expect_equal(clusterMatrix(rsecOut,whichClusters="mergeClusters"), clusterMatrix(mergeOut,whichClusters="mergeClusters")) + expect_equal(clusterTypes(rsecOut),clusterTypes(mergeOut)) }) + diff --git a/tests/testthat/test_clusterMany.R b/tests/testthat/test_clusterMany.R index 729dea6c..a583fd71 100644 --- a/tests/testthat/test_clusterMany.R +++ b/tests/testthat/test_clusterMany.R @@ -6,6 +6,10 @@ test_that("`clusterMany` works with matrix, list of data, ClusterExperiment obje clustNothing <- clusterMany(mat, ks=c(3,4),clusterFunction=c("pam","hierarchicalK","hierarchical01","tight"), subsample=FALSE, sequential=FALSE, isCount=FALSE,verbose=FALSE) + clustDF <- clusterMany(data.frame(mat), ks=c(3,4),clusterFunction=c("pam","hierarchicalK","hierarchical01","tight"), + subsample=FALSE, sequential=FALSE, + isCount=FALSE,verbose=FALSE) + expect_is(clustNothing, "ClusterExperiment") expect_is(clustNothing, "SummarizedExperiment") diff --git a/tests/testthat/test_constructor.R b/tests/testthat/test_constructor.R index 73a33d67..1b9899ec 100644 --- a/tests/testthat/test_constructor.R +++ b/tests/testthat/test_constructor.R @@ -42,7 +42,9 @@ test_that("`clusterExperiment` constructor works with matrix and expect_equal(dim(clusterMatrix(ceSim,whichClusters="all")),x) expect_equal(ncol(clusterMatrix(ceSim,whichClusters="workflow")),12) expect_equal(ncol(clusterMatrix(ceSim,whichClusters=1:3)),3) - + expect_equal(ncol(clusterMatrix(ceSim,whichClusters="dendro")),0) + dend<-makeDendrogram(ceSim) + expect_equal(ncol(clusterMatrix(dend,whichClusters="dendro")),1) }) test_that("adding clusters work as promised",{ ########## diff --git a/tests/testthat/test_dendrogram.R b/tests/testthat/test_dendrogram.R index 5082835a..f59fd9d8 100644 --- a/tests/testthat/test_dendrogram.R +++ b/tests/testthat/test_dendrogram.R @@ -1,4 +1,6 @@ context("Dendrogram") +# library(devtools) +# load_all() source("create_objects.R") test_that("`makeDendrogram` works with matrix, ClusterExperiment objects", { @@ -39,6 +41,19 @@ test_that("`makeDendrogram` preserves the colData and rowData of SE", { }) +test_that("`makeDendrogram` with dimReduce options", { + x<-makeDendrogram(ccSE,dimReduce="PCA",ndims=3) + expect_error(makeDendrogram(ccSE,dimReduce=c("PCA","var"),ndims=3)) + x2<-makeDendrogram(ccSE,dimReduce=c("PCA"),ndims=3,ignoreUnassigned=TRUE) + expect_equal(x,x2) + makeDendrogram(ccSE,dimReduce=c("var"),ndims=3,ignoreUnassigned=FALSE) + makeDendrogram(ccSE,dimReduce=c("var"),ndims=3,ignoreUnassigned=TRUE) + makeDendrogram(ccSE,dimReduce=c("cv"),ndims=3,ignoreUnassigned=FALSE) + makeDendrogram(ccSE,dimReduce=c("cv"),ndims=3,ignoreUnassigned=TRUE) + makeDendrogram(ccSE,dimReduce=c("mad"),ndims=3,ignoreUnassigned=FALSE) + makeDendrogram(ccSE,dimReduce=c("mad"),ndims=3,ignoreUnassigned=TRUE) + +}) test_that("`makeDendrogram` works with whichCluster", { x1<-makeDendrogram(ccSE,whichCluster="Cluster2") x2<-makeDendrogram(ccSE,whichCluster=2) @@ -71,25 +86,100 @@ test_that("`makeDendrogram` works with whichCluster", { expect_error(getBestFeatures(bigCE,contrastType="Dendro"),"Primary cluster does not match the cluster on which the dendrogram was made") }) -test_that("`makeDendrogram` with dimReduce options", { - x<-makeDendrogram(ccSE,dimReduce="PCA",ndims=3) - expect_error(makeDendrogram(ccSE,dimReduce=c("PCA","var"),ndims=3)) - x2<-makeDendrogram(ccSE,dimReduce=c("PCA"),ndims=3,ignoreUnassigned=TRUE) - expect_equal(x,x2) - makeDendrogram(ccSE,dimReduce=c("var"),ndims=3,ignoreUnassigned=FALSE) - makeDendrogram(ccSE,dimReduce=c("var"),ndims=3,ignoreUnassigned=TRUE) - makeDendrogram(ccSE,dimReduce=c("cv"),ndims=3,ignoreUnassigned=FALSE) - makeDendrogram(ccSE,dimReduce=c("cv"),ndims=3,ignoreUnassigned=TRUE) - makeDendrogram(ccSE,dimReduce=c("mad"),ndims=3,ignoreUnassigned=FALSE) - makeDendrogram(ccSE,dimReduce=c("mad"),ndims=3,ignoreUnassigned=TRUE) + +test_that("plotDendrogram works with outgroup", { + leg<-clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]] + leg[,"name"]<-letters[1:nrow(leg)] + clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]]<-leg + dend <- makeDendrogram(ccSE) + plotDendrogram(dend) + plotDendrogram(dend,show.node.label=TRUE) + plotDendrogram(dend,leafType="samples",labelType="name") + plotDendrogram(dend,leafType="samples",labelType="name",removeOutbranch=FALSE) + plotDendrogram(dend,leafType="samples",labelType="colorblock") + plotDendrogram(dend,leafType="clusters",labelType="colorblock") + plotDendrogram(dend,leafType="clusters",labelType="name") + + ## make all -2 + cl<-clusterMatrix(ccSE)[,1] + cl[1]<- -2 + dend2<-addClusters(ccSE,cl,clusterLabel="newCluster") + primaryClusterIndex(dend2)<-3 + dend2 <- makeDendrogram(dend2) + plotDendrogram(dend2,leafType="clusters",labelType="colorblock") + plotDendrogram(dend2,leafType="samples",labelType="colorblock") + plotDendrogram(dend2,leafType="samples",labelType="colorblock",removeOutbranch=FALSE) + + ## make only single sample -2 + cl<-clusterMatrix(ccSE)[,1] + cl[1]<-1 + dend3<-addClusters(ccSE,cl,clusterLabel="newCluster") + primaryClusterIndex(dend3)<-3 + dend3 <- makeDendrogram(dend3) + plotDendrogram(dend3,leafType="clusters",labelType="colorblock") + plotDendrogram(dend3,leafType="samples",labelType="colorblock") + plotDendrogram(dend3,leafType="samples",labelType="colorblock",removeOutbranch=FALSE) + + # This test breaks something. Needs to be figured out. + # ## make all -1 but two samples + # ## can't be only 1 sample because then only 1 cluster so can't make a dendrogram... + # cl<-rep(-1,length=nSamples(ccSE)) + # cl[1]<-3 + # cl[2]<-1 + # dend4<-addClusters(ccSE,cl,clusterLabel="missingCluster") + # primaryClusterIndex(dend4)<-3 + # dend4 <- makeDendrogram(dend4) + # plotDendrogram(dend4,leafType="clusters",labelType="colorblock") + # plotDendrogram(dend4,leafType="samples",labelType="colorblock") + # plotDendrogram(dend4,leafType="samples",labelType="colorblock",removeOutbranch=FALSE) + + ## make all -1 but one sample -- should get error bc only 1 cluster, can't make dendrogram; + ## in case this changes, this test will catch that need to fix plotDendrogram, which makes assumption that not possible. + cl<-rep(-1,length=nSamples(ccSE)) + cl[1]<-3 + dend5<-addClusters(ccSE,cl,clusterLabel="missingCluster") + primaryClusterIndex(dend5)<-3 + expect_error(makeDendrogram(dend5),"Only 1 cluster given. Can not make a dendrogram.") + expect_error(plotDendrogram(dend5,leafType="clusters",labelType="colorblock"),"No dendrogram is found for this ClusterExperiment Object. Run makeDendrogram first.") + + }) -test_that("plotDendrogram works", { +test_that("plotDendrogram works with whichClusters", { + leg<-clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]] + leg[,"name"]<-letters[1:nrow(leg)] + clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]]<-leg dend <- makeDendrogram(ccSE) - leg<-clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]] - leg[,"name"]<-letters[1:nrow(leg)] - clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]]<-leg + dend<-mergeClusters(dend) + plotDendrogram(dend,whichClusters="all",leafType="samples",label="colorblock") + + +}) + + +test_that("plotDendrogram works with cluster missing", { + leg<-clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]] + leg[,"name"]<-letters[1:nrow(leg)] + clusterLegend(ccSE)[[primaryClusterIndex(ccSE)]]<-leg + dend <- makeDendrogram(ccSE,unassignedSamples = c("cluster")) plotDendrogram(dend) plotDendrogram(dend,show.node.label=TRUE) + plotDendrogram(dend,leafType="samples",labelType="name") + plotDendrogram(dend,leafType="samples",labelType="colorblock") + plotDendrogram(dend,leafType="clusters",labelType="colorblock") + plotDendrogram(dend,leafType="clusters",labelType="name") + + ## make all -2 + dend2<-dend + mat<-clusterMatrix(dend2) + mat[1,1]<- -2 + dend2@clusterMatrix<-mat + leg<-dend2@clusterLegend[[1]] + leg<-leg[-which(leg[,"clusterIds"]== -1),] + dend2@clusterLegend[[1]]<-leg + dend2 <- makeDendrogram(dend2,unassignedSamples = c("cluster")) + plotDendrogram(dend2,leafType="clusters",labelType="colorblock") + plotDendrogram(dend2,leafType="samples",labelType="colorblock") + }) \ No newline at end of file diff --git a/tests/testthat/test_mergeClusters.R b/tests/testthat/test_mergeClusters.R index 5bf0e6d5..9abf1acd 100644 --- a/tests/testthat/test_mergeClusters.R +++ b/tests/testthat/test_mergeClusters.R @@ -13,22 +13,27 @@ test_that("`mergeClusters` works with matrix and ClusterExperiment objects", { mergedList <- mergeClusters(x=transform(cl1), isCount=FALSE, cl=primaryCluster(cl1), dendro=clustWithDendro@dendro_clusters, - mergeMethod="adjP", plotType="mergeMethod") + mergeMethod="adjP", plotInfo="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none",plotType="all") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotType="adjP") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotType="locfdr") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="locfdr", plotType="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="MB", plotType="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="JC", plotType="mergeMethod") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotType="mergeMethod") - expect_error(clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotType="mergeMethod"),"can only plot merge method values if one method is selected") - clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotType="none") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none",plotInfo="all") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotInfo="adjP") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotInfo="locfdr") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="locfdr", plotInfo="mergeMethod") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="MB", plotInfo="mergeMethod") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="JC", plotInfo="mergeMethod") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod") + expect_error(clustMerged <- mergeClusters(clustWithDendro, mergeMethod="none", plotInfo="mergeMethod"),"can only plot 'mergeMethod' results if one method is selected") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="none") expect_true("mergeClusters" %in% clusterTypes(clustMerged)) expect_true("mergeClusters" %in% colnames(clusterMatrix(clustMerged))) + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="samples",labelType="colorblock") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="samples",labelType="name") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="clusters",labelType="colorblock") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="clusters",labelType="name") + expect_error(mergeClusters(x=transform(clustWithDendro), isCount=FALSE, cl=primaryCluster(clustWithDendro),plot="none", mergeMethod="adjP", @@ -43,6 +48,7 @@ test_that("`mergeClusters` works with matrix and ClusterExperiment objects", { expect_true("mergeClusters.1" %in% clusterTypes(clustMerged2)) expect_true(!"combineMany.1" %in% clusterTypes(clustMerged2)) expect_true(!"clusterMany.1" %in% clusterTypes(clustMerged2)) + removeClusters(clustMerged, whichRemove = "mergeClusters") }) test_that("`mergeClusters` preserves the colData and rowData of SE", { @@ -59,3 +65,24 @@ test_that("`mergeClusters` preserves the colData and rowData of SE", { expect_equal(rowData(cl),rowData(smSimSE)) }) + + +test_that("`mergeClusters` works with unassignedSamples", { + + clustWithDendro <- makeDendrogram(ceSim,unassignedSamples = c("outgroup")) + + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="samples",labelType="colorblock") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="samples",labelType="name") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="clusters",labelType="colorblock") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="clusters",labelType="name") + + clustWithDendro <- makeDendrogram(ceSim,unassignedSamples = c("cluster")) + + expect_warning(mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="samples",labelType="colorblock")) + expect_warning(mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="samples",labelType="name")) + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="clusters",labelType="colorblock") + clustMerged <- mergeClusters(clustWithDendro, mergeMethod="adjP", plotInfo="mergeMethod",leafType="clusters",labelType="name") + + +}) + diff --git a/tests/testthat/test_pca.R b/tests/testthat/test_pca.R new file mode 100644 index 00000000..2aed1b99 --- /dev/null +++ b/tests/testthat/test_pca.R @@ -0,0 +1,40 @@ +context("PCA") +source("create_objects.R") + +test_that("Fast PCA gives the same results as PCA", { + + ## k = NCOL(mat) (should use regular svd) + pca_res <- prcomp(mat) + pca_res2 <- clusterExperiment:::.pca(mat, k=NCOL(mat)) + expect_equivalent(pca_res$x, pca_res2) + + pca_res <- prcomp(mat, center=FALSE) + pca_res2 <- clusterExperiment:::.pca(mat, k=NCOL(mat), center=FALSE) + expect_equivalent(pca_res$x, pca_res2) + + pca_res <- prcomp(mat, scale=TRUE) + pca_res2 <- clusterExperiment:::.pca(mat, k=NCOL(mat), scale=TRUE) + expect_equivalent(pca_res$x, pca_res2) + + pca_res <- prcomp(mat, scale=TRUE, center=FALSE) + pca_res2 <- clusterExperiment:::.pca(mat, k=NCOL(mat), scale=TRUE, center=FALSE) + expect_equivalent(pca_res$x, pca_res2) + + ## k < NCOL(mat) -- note that the signed of some components may be flipped + pca_res <- prcomp(mat) + pca_res2 <- clusterExperiment:::.pca(mat, k=10) + expect_equivalent(abs(pca_res$x[,1:10]), abs(pca_res2)) + + pca_res <- prcomp(mat, center=FALSE) + pca_res2 <- clusterExperiment:::.pca(mat, k=10, center=FALSE) + expect_equivalent(abs(pca_res$x[,1:10]), abs(pca_res2)) + + pca_res <- prcomp(mat, center=FALSE, scale=TRUE) + pca_res2 <- clusterExperiment:::.pca(mat, k=10, center=FALSE, scale=TRUE) + expect_equivalent(abs(pca_res$x[,1:10]), abs(pca_res2)) + + pca_res <- prcomp(mat, center=TRUE, scale=TRUE) + pca_res2 <- clusterExperiment:::.pca(mat, k=10, center=TRUE, scale=TRUE) + expect_equivalent(abs(pca_res$x[,1:10]), abs(pca_res2)) + +}) diff --git a/tests/testthat/test_plotting.R b/tests/testthat/test_plotting.R index b8551288..1ab19ee8 100644 --- a/tests/testthat/test_plotting.R +++ b/tests/testthat/test_plotting.R @@ -116,7 +116,8 @@ test_that("`plotHeatmap` works with matrix objects", { #check internal alignment of sampleData (alignSampleData=TRUE) is working: sampleData<-clusterMatrix(smSimCE) alList<-plotClusters(sampleData) - alCol<-alList$clusterLegend + alCol<-clusterExperiment:::.convertToAheatmap(alList$clusterLegend, names=FALSE) + #these should be same plots: x1<-plotHeatmap(data=smSimData[,alList$orderSamples],sampleData=sampleData[alList$orderSamples,1:10],clusterLegend=alCol,clusterSamples=FALSE,clusterFeatures=FALSE) x2<-plotHeatmap(data=smSimData[,alList$orderSamples],sampleData=sampleData[alList$orderSamples,1:10],alignSampleData=TRUE,clusterFeatures=FALSE,clusterSamples=FALSE) # Should get this working so proper test, but more a problem because in different order, otherwise the same. Don't want to deal with this right now. @@ -135,6 +136,8 @@ test_that("`plotHeatmap` works with matrix objects", { ##Should add tests that pass aheatmap arguments correctly. }) + + test_that("`plotHeatmap` works with ClusterExperiment and SummarizedExperiment objects", { plotHeatmap(cc) @@ -142,9 +145,11 @@ test_that("`plotHeatmap` works with ClusterExperiment and SummarizedExperiment o expect_warning(plotHeatmap(cc,whichClusters="workflow") ,"whichClusters value does not match any clusters") #there are no workflow for this one plotHeatmap(smSimCE,whichClusters="workflow",overRideClusterLimit=TRUE) - plotHeatmap(smSimCE,whichClusters="all",alignSampleData=TRUE,overRideClusterLimit=TRUE) - expect_error(plotHeatmap(smSimCE,whichClusters=1:15),"Indices in whichClusters invalid") + expect_warning(plotHeatmap(smSimCE,whichClusters=1:15),"given whichClusters value does not match any clusters") + expect_error( plotHeatmap(smSimCE,whichClusters="all", alignSampleData=TRUE, overRideClusterLimit=FALSE), "More than 10 annotations/clusterings") + plotHeatmap(smSimCE,whichClusters="all",alignSampleData=FALSE,overRideClusterLimit=TRUE) + #test sampleData expect_error(plotHeatmap(cc,sampleData="A"), "no colData for object data") @@ -157,15 +162,41 @@ test_that("`plotHeatmap` works with ClusterExperiment and SummarizedExperiment o plotHeatmap(cc) #check user setting clusterLegend + x<-palette()[1:7] + names(x)<-clusterLegend(cc)$Cluster1[,"name"] + plotHeatmap(cc,clusterLegend=list("Cluster1"=x)) + plotHeatmap(cc,clusterLegend=list("Cluster1"=palette()[1:7])) - plotHeatmap(smSimCE,sampleData="A",clusterLegend=list("A"=palette()[1:3])) - # the following works outside of the test but not inside + plotHeatmap(smSimCE,sampleData="A",clusterLegend=list("A"=palette()[1:4])) + + names(x)<-LETTERS[1:7] + expect_error( plotHeatmap(cc,clusterLegend=list("Cluster1"=x)),"do not cover all levels in the data") + x<-palette()[1:6] + names(x)<-LETTERS[1:6] + expect_error( plotHeatmap(cc,clusterLegend=list("Cluster1"=x)),"is less than the number of levels in the data") + + ######################## + ######################## + # the following checks work outside of the test but inside test_that, they hit errors # possibly issue with testthat? Not evaluating for now. - #plotHeatmap(smSimCE, sampleData="all", whichClusters="none") - - #SummarizedExperiment - plotHeatmap(smSimSE) - + ######################## + ######################## + # + # plotHeatmap(smSimCE, sampleData="all", whichClusters="none") + # + # #this test doesn't work -- for some reason, expect_warning environment hits error that don't see at the consule. + # plotHeatmap(smSimCE,whichClusters="all",alignSampleData=TRUE,overRideClusterLimit=TRUE) + # expect_warning( plotHeatmap(smSimCE, whichClusters="all", alignSampleData=TRUE, overRideClusterLimit=TRUE) + # , "More than 10 annotations/clusterings") + # + # # create some names to see if keeps names with alignSampleData=TRUE + # # only can check manually, not with testthat. + # # BUG!: doesn't work. looses their -1/-2 designation... haven't fixed yet. + # clLeg<-clusterLegend(smSimCE) + # clLeg[[1]][,"name"]<-LETTERS[1:nrow(clLeg[[1]])] + # clusterLegend(smSimCE)<-clLeg + # plotHeatmap(smSimCE, whichClusters="all", alignSampleData=TRUE,overRideClusterLimit=TRUE) + # }) test_that("`plotHeatmap` visualization choices/feature choices all work", { @@ -190,7 +221,9 @@ test_that("`plotHeatmap` visualization choices/feature choices all work", { plotHeatmap(smSimCE,visualizeData="transform",clusterFeaturesData="PCA",nFeatures=10,clusterSamplesData="hclust") plotHeatmap(smSimCE,visualizeData="transform",clusterSamplesData="dendrogramValue") - + #test works with outside dataset + plotHeatmap(smSimCE,visualizeData=assay(smSimCE)[1:10,]) + expect_error(plotHeatmap(smSimCE, visualizeData=assay(smSimCE)[,1:5])) }) test_that("`makeBlankData` works", { diff --git a/vignettes/clusterExperimentTutorial.Rmd b/vignettes/clusterExperimentTutorial.Rmd index 58bc576d..36b8fdd3 100644 --- a/vignettes/clusterExperimentTutorial.Rmd +++ b/vignettes/clusterExperimentTutorial.Rmd @@ -13,6 +13,7 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{clusterExperiment Vignette} --> + ```{r GlobalOptions, results="hide", include=FALSE, cache=FALSE} knitr::opts_chunk$set(fig.align="center", cache=FALSE, cache.path = "clusterExperimentTutorial_cache/", fig.path="clusterExperimentTutorial_figure/",error=FALSE, #make it stop on error fig.width=6,fig.height=6,autodep=TRUE,out.width="600px",out.height="600px", @@ -129,7 +130,6 @@ assays(se) <- list(normalized_counts=fq) Here is our call to `clusterMany`: ```{r clusterMany} -options(getClass.msg=FALSE) #get rid of annoying messages about cache until fixed internally in R library(clusterExperiment) ce<-clusterMany(se, clusterFunction="pam",ks=5:10, isCount=TRUE,dimReduce=c("PCA","var"),nVarDims=c(100,500,1000), @@ -290,10 +290,13 @@ Now we are ready to actually merge clusters together. We now run `mergeClusters` It is useful to first run `mergeClusters` without actually creating any merged clusters so as to preview what the final clustering will be (and perhaps to help in setting the cutoff). ```{r mergeClustersPlot} -mergeClusters(ce,mergeMethod="adjP",plot="mergeMethod") +mergeClusters(ce,mergeMethod="adjP",plotInfo="mergeMethod") ``` -Then we can decide on a cutoff and visualize the resulting clustering. +Then we can decide on a cutoff and visualize the resulting clustering. + + ```{r mergeClusters} ce<-mergeClusters(ce,mergeMethod="adjP",cutoff=0.01) diff --git a/vignettes/clusterExperimentTutorial.html b/vignettes/clusterExperimentTutorial.html index 073cfad8..cf8aa2c6 100644 --- a/vignettes/clusterExperimentTutorial.html +++ b/vignettes/clusterExperimentTutorial.html @@ -10,7 +10,7 @@ - + clusterExperiment Vignette @@ -51,7 +51,7 @@

Contents

@@ -226,8 +226,7 @@

2.2 Step 0: Filtering and normali

2.3 Step 1: Clustering with clusterMany

clusterMany lets the user quickly pick between many clustering options and run all of the clusterings in one single command. In the quick start we pick a simple set of clusterings based on varying the dimensionality reduction options. The way to designate which options to vary is to give multiple values to an argument. Due to a bug in R, we need to set getClass.msg=FALSE or otherwise a slew of annoying warnings will spit out at every call; this should be fixed in the next patch to R.

Here is our call to clusterMany:

-
options(getClass.msg=FALSE) #get rid of annoying messages about cache until fixed internally in R
-library(clusterExperiment)
+
library(clusterExperiment)
 ce<-clusterMany(se, clusterFunction="pam",ks=5:10,
       isCount=TRUE,dimReduce=c("PCA","var"),nVarDims=c(100,500,1000),
       nPCADims=c(5,15,50),run=TRUE)
@@ -249,7 +248,7 @@

2.3 Step 1: Clustering with

-

+

This plot shows the samples in the columns, and different clusterings on the rows. Each sample is color coded based on its clustering for that row, where the colors have been chosen to try to match up clusters across different clusterings that show large overlap. Moreover, the samples have been ordered so that each subsequent clustering (starting at the top and going down) will try to order the samples to keep the clusters together, without rearranging the clustering blocks of the previous clustering/row.

Notice that we also added the sampleData argument in our call, indicating that we also want to visualize some information about the samples saved in the colData slot (inherited from our original fluidigm object). We chose the columns “Biological_Condition” and “Cluster2” from colData, which correspond to the original biological condition of the experiment, and the clusters reported in the original paper, respectively. These are shown at the bottom of the plot.

Notice that some samples are white. This indicates that they have the value -1, meaning they were not clustered. This is from our choices to require at least 5 samples to make a cluster.

@@ -265,7 +264,7 @@

2.3 Step 1: Clustering with -

+

We see that the order in which the clusters are given to plotClusters changes the plot greatly. There are many different options for how to run plotClusters discussed in in the detailed section on plotClusters, but for now, this plot is good enough for a quick visualization.

2.3.1 The output

@@ -302,7 +301,7 @@

2.4 Step 2: Find a consensus with ## [6,] -1 1 4
par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow")
-

+

The default result of combineMany is not usually a great choice, and certainly isn’t helpful here. The clustering from the default combineMany leaves most samples unassigned (white in the above plot). This is because the default way of combining is very conservative – it requires samples to be in the same cluster in every clustering to be assigned a cluster. This is quite stringent. We can vary this by setting the proportion argument to indicate the minimum proportion of times they should be together with other samples in the cluster they are assigned to. Explicit details on how combineMany makes these clusters are discussed in the section on combineMany.

So let’s label the one we found “combineMany, default” and then create a new one. (Making an informative label will make it easier to keep track of this particular clustering later, particularly if we make multiple calls to the workflow).

wh<-which(clusterLabels(ce)=="combineMany")
@@ -312,16 +311,16 @@ 

2.4 Step 2: Find a consensus with
## Note: no clusters specified to combine, using results from clusterMany
par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow")
-

+

We see that more clusters are detected. Those that are still not assigned a cluster from combineMany clearly vary across the clusterings as to whether the samples are clustered together or not. Varying the proportion argument will adjust whether some of the unclustered samples get added to a cluster. There is also a minSize parameter for combineMany, with the default of minSize=5. We could reduce that requirement as well and more of the unclustered samples would be grouped into a cluster. Here, we reduce it to minSize=3 (we’ll call this “combineMany,final”):

ce<-combineMany(ce,proportion=0.7,minSize=3,clusterLabel="combineMany,final")
## Note: no clusters specified to combine, using results from clusterMany
par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow",main="Min. Size=3")
-

+

We can also visualize the proportion of times these clusters were together across these clusterings (this information was made and stored in the ClusterExperiment object when we called combineMany as long as proportion value is <1):

plotCoClustering(ce)
-

+

This visualization can help in determining whether to change the value of proportion (though see combineMany for how -1 assignments affect combineMany).

@@ -332,7 +331,7 @@

2.5 Step 3: Merge clusters togeth

As an example, here we use the 500 most variable genes to make the cluster hierarchy.

ce<-makeDendrogram(ce,dimReduce="var",ndims=500)
 plotDendrogram(ce)
-

+

We can see that clusters 1 and 3 are most closely related, at least in the top 500 most variable genes.

If we look at the summary of ce, it now has ‘makeDendrogram’ marked as ‘Yes’.

ce
@@ -353,27 +352,29 @@

2.5 Step 3: Merge clusters togeth ## mergeClusters run? No

Now we are ready to actually merge clusters together. We now run mergeClusters that will go up this hierarchy and compare the level of differential expression (DE) in each pair. In other words, if we focus on the left side of the tree, DE tests are run, between 1 and 3, and between 6 and 8. If there is not enough DE between each of these (based on a cutoff that can be set by the user), then clusters 1 and 3 and/or 6 and 8 will be merged. And so on up the tree.

It is useful to first run mergeClusters without actually creating any merged clusters so as to preview what the final clustering will be (and perhaps to help in setting the cutoff).

-
mergeClusters(ce,mergeMethod="adjP",plot="mergeMethod")
+
mergeClusters(ce,mergeMethod="adjP",plotInfo="mergeMethod")
## Note: Merging will be done on ' combineMany,final ', with clustering index 1
-

-

Then we can decide on a cutoff and visualize the resulting clustering.

+

+

Then we can decide on a cutoff and visualize the resulting clustering.

+
ce<-mergeClusters(ce,mergeMethod="adjP",cutoff=0.01)
## Note: Merging will be done on ' combineMany,final ', with clustering index 1
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 2.9. Rerun with
 ## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 1.6. Rerun with
 ## increased df
-

+

par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"))
 plotCoClustering(ce,whichClusters=c("mergeClusters","combineMany"),
                  sampleData=c("Biological_Condition","Cluster2"),annLegend=FALSE)
-

+

Notice that mergeClusters combines clusters based on the actual values of the features, while the coClustering plot shows how often the samples clustered together. It is not uncommon that mergeClusters will merge clusters that don’t look “close” on the coClustering plot. This can be due to just the choices of the hierarchical clustering, but can also be because the two merged clusters are not often confused for each other across the clustering algorithms, yet don’t have strong differences on individual genes. This can be the case especially when the clustering is done on reduced PCA space, where an accumulation of small differences might consistently separate the samples (so the two clusters are not “confused” as to the samples), but because the differences are not strong on individual genes, mergeClusters combines them. These are ultimately different criteria.

Finally, we can do a heatmap visualizing this final step of clustering.

plotHeatmap(ce,clusterSamplesData="dendrogramValue",breaks=.99,
             sampleData=c("Biological_Condition", "Cluster1", "Cluster2"))
-

+

By choosing “dendrogramValue” for the clustering of the samples, we will be showing the clusters according to the hierarchical ordering of the clusters found by makeDendrogram. The argument breaks=0.99 means that the last color of the heatmap spectrum will be forced to be the top 1% of the data (rather than evenly spaced through the entire range of values). This can be helpful in making sure that rare extreme values in the upper range do not absorb too much space in the color spectrum. There are many more options for plotHeatmap, some of which are discussed in the section on plotHeatmap.

@@ -408,7 +409,7 @@

2.6 Step 4: Finding Features rela clusterFeaturesData=unique(pairsAll[,"IndexInOriginal"]), main="Heatmap of features w/ significant pairwise differences", breaks=.99) -

+

Notice that the samples clustered into the -1 cluster (i.e. not assigned) are clustered as an outgroup. They can also be mixed into the dendrogram (see makeDendrogram)

@@ -516,18 +517,18 @@

4.1 Plotting the clusters

par(mar=plotCMar)
 plotClusters(ce,main="Clusters from clusterMany", whichClusters="workflow", 
              axisLine=-1)
-

+

We have seen that we can get very different plots depending on how we order the clusterings, and what clusterings are included. The argument whichClusters allows the user to choose different clusterings or provide an explicit ordering of the clusterings. whichClusters can take either a single character value, or a vector of either characters or indices. If whichClusters matches either “all” or “workflow”, then the clusterings chosen are either all, or only those from the most recent calls to the workflow functions. Choosing “workflow” removes from the visualization both user-defined clusterings and also previous calls to the workflow that have since been rerun. Setting whichClusters="workflow" can be a useful if you have called a method like combineMany several times, as we did, only with different parameters. All of those runs are saved (unless eraseOld=TRUE), but you may not want to plot them.

If whichClusters is a character that is not one of these designated values, the entries should match a clusterType value (like clusterMany) or a clusterLabel value (with exact matching). Alternatively, the user can specify numeric indices corresponding to the columns of clusterMatrix that provide the order of the clusters.

par(mar=plotCMar)
 plotClusters(ce,whichClusters="clusterMany",
                main="Only Clusters from clusterMany",axisLine=-1)
-

+

We can also add to our plot (categorical) information on each of our subjects from the colData of our SummarizedExperiment object (which is also retained in our ClusterExperiment object). This can be helpful to see if the clusters correspond to other features of the samples, such as sample batches. Here we add the values from the columns “Biological_Condition” and “Cluster2” that were present in the fluidigm object and given with the published data.

par(mar=plotCMar)
 plotClusters(ce,whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"), 
                main="Workflow clusters plus other data",axisLine=-1)
-

+

4.1.1 Saving the alignment of plotClusters

plotClusters invisibly returns a ClusterExperiment object. In our earlier calls to plotCluster, this would be the same as the input object and so there is no reason to save it. However, the alignment and color assignments created by plotClusters can be requested to be saved via the resetNames, resetColors and resetOrderSamples arguments. If any of these are set to TRUE, then the object returned will be different than those of the input. Specifically, if resetColors=TRUE the colorLegend of the returned object will be changed so that the colors assigned to each cluster will be as were shown in the plot. Similarly, if resetNames=TRUE the names of the clusters will be changed to be integer values, but now the integers will be aligned to try to be the same across clusters (and therefore not consecutive integers, which is why these are saved as names for the clusters and not clusterIds). If resetOrderSamples=TRUE, then the order of the samples shown in the plot will be similarly saved in the slot orderSamples.

@@ -535,7 +536,7 @@

4.1.1 Saving the alignment of plo
par(mar=plotCMar)
 ce_temp<-plotClusters(ce,whichClusters="workflow", sampleData=c("Biological_Condition","Cluster2"), 
                main="Clusters from clusterMany, different order",axisLine=-1,resetNames=TRUE,resetColors=TRUE,resetOrderSamples=TRUE)
-

+

clusterLegend(ce_temp)[c("mergeClusters","combineMany,final")]
## $mergeClusters
 ##    clusterIds color     name
@@ -569,7 +570,7 @@ 

4.1.1 Saving the alignment of plo existingColors="all", whichClusters="clusterMany", main="clusterMany Clusters, fix the color of clusters", axisLine=-1)

-

+

@@ -578,7 +579,7 @@

4.2 Heatmap with the clusters

par(mfrow=c(1,1)) par(mar=defaultMar) plotHeatmap(ce,main="Heatmap with clusterMany") -

+

The plotHeatmap command has numerous options, in addition to those of aheatmap. plotHeatmap mainly provides additional functionality in the following areas:

  • Easy inclusion of clustering information or sample information, based on the ClusterExperiment object.
  • @@ -591,7 +592,9 @@

    4.2.1 Displaying clustering or sa

    Here we create a heatmap that shows the clusters from the workflow. Notice that we choose only the last 2 – from combineMany and mergeClusters. If we chose all “workflow” clusters it would be too many.

    whClusterPlot<-1:2
     plotHeatmap(ce,whichClusters=whClusterPlot, annLegend=FALSE)
    -

    +
    ## Warning in .local(data, ...): given whichClusters value does not match any
    +## clusters, none will be plotted
    +

    Notice we also passed the option ‘annLegend=FALSE’ to the underlying aheatmap command (with many clusterings shown, it is often not useful to have a legend for all the clusters because the legend doesn’t fit on the page!). The many detailed commands of aheatmap that are not set internally by plotHeatmap can be passed along as well.

    Like plotClusters, plotHeatmap takes an argument sampleData, which refers to columns of the colData of that object and can be included.

@@ -601,7 +604,7 @@

4.2.2 Additional options for clus
plotHeatmap(ce,clusterSamplesData="primaryCluster",
             whichClusters="primaryCluster",
             main="Heatmap with clusterMany",annLegend=FALSE)
-

+

As an improvement upon this, we can cluster the clusters into a dendrogram so that the most similar clusters will be near each other. We already did this before with our call to makeDendrogram. We haven’t done anything to change that, so the dendrogram from that call is still stored in the object. We can check this in the information shown in our object:

show(ce)
## class: ClusterExperiment 
@@ -624,7 +627,7 @@ 

4.2.2 Additional options for clus whichClusters=c("mergeClusters","combineMany"), main="Heatmap with clusterMany", sampleData=c("Biological_Condition","Cluster2"),annLegend=FALSE)

-

+

If there is not a dendrogram stored, plotHeatmap will call makeDendrogram based on the primary cluster (with the default settings of makeDendrogram); calling makeDendrogram on ce is preferred so that the user can control the choices in how it is done (which we will discuss below). For visualization purposes, the dendrogram for the combineMany cluster is preferred to that of the mergeCluster cluster, since “combineMany,final” is just a finer partition of the “mergeClusters” clustering.

@@ -786,12 +789,12 @@

5.1.3 Example changing the distan

Note on 0-1 clustering when subsample=FALSE We would note that the default values \(\alpha\) for the 0-1 clustering were set with the distance \(D\) the result of subsampling or other concensus summary in mind. In generally, subsampling creates a \(D\) matrix with high similarity for many samples who share a cluster (the proportion of times samples are seen together for well clustered samples can easily be in the .8-.95 range, or even exactly 1). For this reason the default \(\alpha\) is 0.1 which requires distances between samples in the 0.1 range or less (i.e. a similarity in the range of 0.9 or more). We show an example of the \(D\) matrix from subsampling; we make use of the clusterSingle which is the workhorse mentioned above that runs a single clustering command directly, which gives the output \(D\) from the sampling in the “coClustering” slot of ce. Note that the result is \(1-p_{ij}\) where \(p_{ij}\) is the proportion of times sample \(i\) and \(j\) clustered together.

ceSub<-clusterSingle(ce,dimReduce="mad",ndims=1000,subsample=TRUE,clusterFunction="hierarchical01",subsampleArgs=list(k=8),clusterLabel="subsamplingCluster",clusterDArgs=list(minSize=5))
 plotCoClustering(ceSub,colorScale=rev(seqPal5))
-

+

We see even here, the default of \(\alpha=0.1\) was perhaps too conservative since only two clusters came out (with size greater than 5).

The distances based on correlation calculated directly on the data, such as we created above, are often used for clustering expression data. But they are unlikely to have distances as low as seen in subsampling, even for well clustered samples. Here’s a visualization of the correlation distance matrix we defined above (using Spearman’s correlation) on the top 1000 most variable features:

dSp<-spearDist(t(transform(ce,dimReduce="mad",nVarDims=1000)))
 plotHeatmap(dSp,isSymmetric=TRUE,colorScale=rev(seqPal5))
-

+

We can see that the choice of \(\alpha\) must be much higher (and we are likely to be more sensitive to it).

Notice to calculate the distance in the above plot, we made use of the transform function applied to our ce object to get the results of dimensionality reduction. The transform function gave us a data matrix back that has been transformed, and also reduced in dimensions, like would be done in our clustering routines. transform has similar parameters as seen in clusterMany,makeDendrogram or clusterSingle and is useful when you want to manually apply something to transformed and/or dimensionality reduced data; and you can be sure you are getting the same matrix of data back that the clustering algorithms are using.

Comparing distance functions with clusterMany Now that we have defined the distances we want to compare in our global environment, we can give these to the argument “distFunction” in clusterMany. They should be given as character strings giving the names of the functions. For computational ease for this vignette, we will just choose the dimensionality reduction to be the top 1000 features based on MAD and set K=8 or \(\alpha=0.45\). We will save these results as a separate object so as to not disrupt the earlier workflow.

@@ -804,7 +807,7 @@

5.1.3 Example changing the distan clusterLabels(ceDist)<-gsub("hierarchical","hier",clusterLabels(ceDist)) par(mar=c(1.1,15.1,1.1,1.1)) plotClusters(ceDist,axisLine=-2,sampleData=c("Biological_Condition")) -

+

Notice that using the 01 methods did not give relevant results

@@ -840,19 +843,19 @@

5.2 Create a unified cluster from

As mentioned in the Quick Start section, the default option for combineMany is to only define a cluster when all of the samples are in the same clusters across all clusterings. However, this is generally too conservative and just results in most samples not being assigned to a cluster.

Instead combineMany has a parameter proportion that governs in what proportion of clusterings the samples should be together. Internally, combineMany makes a coClustering matrix \(D\). Like the \(D\) created by subsampling in clusterMany, the coClustering matrix takes on values 0-1 for the proportion of times the samples are together in the clustering. This \(D\) matrix is saved in the ce object and can be visualized with plotCoClustering (which is just a call to plotHeatmap). Recall the one we last made in the QuickStart, with our last call to combineMany (proportion=0.7 and minSize=3).

plotCoClustering(ce)
-

+

combineMany performs the clustering by running a “01” clustering algorithm on the \(D\) matrix of percentage co-clustering (the default being “hierarchical01”). The alpha argument to the 01 clustering is 1-proportion. Also passed to the clustering algorithm is the parameter minSize which sets the minimum size of a cluster.

We can also manually choose the set of clusters to use in combineMany with the argument whichClusters. Here we choose only the clusters that correspond to using dimensionality reduction using the most variable features. We also set minSize to be lower than the default of 5 to allow for smaller clusters

wh<-grep("nVAR",clusterLabels(ce))
 ce<-combineMany(ce,whichCluster=wh,proportion=0.7,minSize=3,
                 clusterLabel="combineMany,nVAR")
 plotCoClustering(ce)
-

+

We can compare to all of our other versions of combineMany. While they do not all have clusterTypes equal to “combineMany” (only the most recent call has clusterType exactly equal to “combineMany”), they all have “combineMany” as part of their clusterType, even though they have different clusterLabels (and now we’ll see that it was useful to give them different labels!)

wh<-grep("combineMany",clusterTypes(ce))
 par(mar=plotCMar)
 plotClusters(ce,whichClusters=rev(wh),axisLine=-1)
-

+

Treatment of Unclustered assignments -1 values are treated separately in the calculation. In particular, they are not considered in the calculation of percentage co-clustering – the percent co-clustering is taken only with respect to those clusterings where both samples were assigned. However, a post-processing is done to the clusters found from running the clustering on the \(D\) matrix. For each sample, the percentage of times that they were marked -1 in the clusterings is calculated. If this percentage is greater than the argument propUnassigned then the sample is marked as -1 in the clustering returned by combineMany.

Good scenarios for running combineMany Varying certain parameters result in clusterings better for combineMany than other sets of parameters. In particular, if there are huge discrepancies in the set of clusterings given to combineMany, the results will be a shattering of the samples into many small clusters. Similarly, if the number of clusters \(K\) is very different, the end result will likely be like that of the large \(K\), and how much value that is (rather than just picking the clustering with the largest \(K\)), is debatable. However, for “01” clustering algorithms or clusterings using the sequential algorithm, varying the underlying parameters \(\alpha\) or \(k_0\) often results in roughly similar clusterings across the parameters so that creating a consensus across them is highly informative.

@@ -867,7 +870,7 @@

5.3.1 makeDendrogram

Like clustering, the dendrogram can depend on what features are included from the data. The same options for clustering are available for the hierarchical clustering of the clusters, namely choices of dimensionality reduction via dimReduce and the number of dimensions via ndims.

ce<-makeDendrogram(ce,dimReduce="var",ndims=500)
 plotDendrogram(ce)
-

Notice that the plot of the dendrogram shows the hierarchy of the clusters (and color codes them according to the colors stored in colorLegend slot).

+

Notice that the plot of the dendrogram shows the hierarchy of the clusters (and color codes them according to the colors stored in colorLegend slot).

Recall that the most recent clustering made is from our call to combineMany, where we experimented with using on some of the clusterings from clusterMany, so that is our current primaryCluster:

show(ce)
## class: ClusterExperiment 
@@ -890,7 +893,7 @@ 

5.3.1 makeDendrogram

ce<-makeDendrogram(ce,dimReduce="var",ndims=500,
                    whichCluster="combineMany,final")
 plotDendrogram(ce)
-

+

Note that the clusterType of this clustering is not “combineMany”, but “combineMany.x”, where “x” indicates what iteration it was:

clusterTypes(ce)[which(clusterLabels(ce)=="combineMany,final")]
## [1] "combineMany.3"
@@ -941,7 +944,7 @@

5.3.2 Merging clusters with littl ## increased df

## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 1.6. Rerun with
 ## increased df
-

+

Now we can pick a cutoff. We’ll give it a label to keep it separate from the previous run we had made.

ce<-mergeClusters(ce,cutoff=0.05,mergeMethod="adjP",clusterLabel="mergeClusters,v2")
## Note: Merging will be done on ' combineMany,final ', with clustering index 3
@@ -949,7 +952,7 @@

5.3.2 Merging clusters with littl ## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 1.6. Rerun with
 ## increased df
-

+

ce
## class: ClusterExperiment 
 ## dim: 7069 65 
@@ -974,7 +977,7 @@ 

5.3.2 Merging clusters with littl ## increased df

## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 1.6. Rerun with
 ## increased df
-

+

ce
## class: ClusterExperiment 
 ## dim: 7069 65 
@@ -1033,7 +1036,7 @@ 

5.4.1 Designate a Final Clusterin clusterLabel="Final Clustering") par(mar=plotCMar) plotClusters(ce,whichClusters="workflow")

-

+

Note that because it is labeled as “final” it shows up automatically in “workflow” clusters in our plotClusters plot. It has also been set as our primaryCluster and has the new clusterLabel we gave it in the call to setToFinal.

This didn’t get rid of our undesired mergeClusters result that is most recent. It still shows up as “the” mergeClusters result. This might be undesired. We could remove that “mergeClusters” result with removeClusters. Alternatively, we could manually change the clusterTypes to mergeClusters.x so that it doesn’t show up as current. A cleaner way to do this would have been to first set the desired cluster (“mergeClusters.4”) to the most current iteration with setToCurrent, which would have bumped up the existing mergeClusters result to be no longer current.

@@ -1329,7 +1332,7 @@

6.1.3 Dendrogram

## [5] "X1-X6"

We can plot the dendrogram showing the node names to help make sense of which contrasts go with which nodes (plotDendrogram calls plot.phylo from the ape package and can take those arguments).

plotDendrogram(ce,show.node.label=TRUE)
-

+

@@ -1353,59 +1356,76 @@

6.4 Additional considerations

7 Session Information

This vignette was compiled under:

sessionInfo()
-
## R version 3.3.0 beta (2016-04-04 r70420)
-## Platform: x86_64-apple-darwin13.4.0 (64-bit)
-## Running under: OS X 10.10.5 (Yosemite)
+
## R version 3.4.0 (2017-04-21)
+## Platform: x86_64-apple-darwin15.6.0 (64-bit)
+## Running under: OS X El Capitan 10.11.6
+## 
+## Matrix products: default
+## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
+## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
 ## 
 ## locale:
 ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
 ## 
 ## attached base packages:
-## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
-## [8] methods   base     
+## [1] parallel  stats4    methods   stats     graphics  grDevices utils    
+## [8] datasets  base     
 ## 
 ## other attached packages:
-##  [1] scRNAseq_1.0.0             BiocStyle_2.2.1           
-##  [3] clusterExperiment_1.1.2    BiocInstaller_1.24.0      
-##  [5] testthat_1.0.2             SummarizedExperiment_1.4.0
-##  [7] Biobase_2.34.0             GenomicRanges_1.26.4      
-##  [9] GenomeInfoDb_1.10.3        IRanges_2.8.2             
-## [11] S4Vectors_0.12.2           BiocGenerics_0.20.0       
-## [13] devtools_1.12.0           
+##  [1] clusterExperiment_1.3.0-9009 scRNAseq_1.2.0              
+##  [3] SummarizedExperiment_1.6.3   DelayedArray_0.2.7          
+##  [5] matrixStats_0.52.2           Biobase_2.36.2              
+##  [7] GenomicRanges_1.28.3         GenomeInfoDb_1.12.2         
+##  [9] IRanges_2.10.2               S4Vectors_0.14.3            
+## [11] BiocGenerics_0.22.0          BiocStyle_2.4.0             
 ## 
 ## loaded via a namespace (and not attached):
-##  [1] uuid_0.1-2         backports_1.0.5    NMF_0.20.6        
-##  [4] plyr_1.8.4         lazyeval_0.2.0     splines_3.3.0     
-##  [7] rncl_0.8.2         ggplot2_2.2.1      gridBase_0.4-7    
-## [10] digest_0.6.12      foreach_1.4.3      htmltools_0.3.5   
-## [13] viridis_0.4.0      magrittr_1.5       memoise_1.0.0     
-## [16] cluster_2.0.6      doParallel_1.0.10  limma_3.30.13     
-## [19] matrixStats_0.52.0 prettyunits_1.0.2  colorspace_1.3-2  
-## [22] dplyr_0.5.0        crayon_1.3.2       RCurl_1.95-4.8    
-## [25] jsonlite_1.3       roxygen2_6.0.1     phylobase_0.8.2   
-## [28] iterators_1.0.8    ape_4.1            registry_0.3      
-## [31] gtable_0.2.0       zlibbioc_1.20.0    XVector_0.14.1    
-## [34] kernlab_0.9-25     prabclus_2.2-6     DEoptimR_1.0-8    
-## [37] abind_1.4-5        scales_0.4.1       mvtnorm_1.0-6     
-## [40] DBI_0.6-1          rngtools_1.2.4     Rcpp_0.12.10      
-## [43] viridisLite_0.2.0  xtable_1.8-2       progress_1.1.2    
-## [46] bold_0.4.0         mclust_5.2.3       httr_1.2.1        
-## [49] RColorBrewer_1.1-2 fpc_2.1-10         modeltools_0.2-21 
-## [52] reshape_0.8.6      XML_3.98-1.6       flexmix_2.3-13    
-## [55] nnet_7.3-12        howmany_0.3-1      reshape2_1.4.2    
-## [58] munsell_0.4.3      tools_3.3.0        ade4_1.7-6        
-## [61] evaluate_0.10      stringr_1.2.0      yaml_2.1.14       
-## [64] knitr_1.15.1       robustbase_0.92-7  dendextend_1.5.2  
-## [67] nlme_3.1-131       whisker_0.3-2      taxize_0.8.4      
-## [70] xml2_1.1.1         rstudioapi_0.6     tibble_1.3.0      
-## [73] RNeXML_2.0.7       stringi_1.1.3      desc_1.1.0        
-## [76] lattice_0.20-35    trimcluster_0.1-2  Matrix_1.2-8      
-## [79] commonmark_1.2     data.table_1.10.4  bitops_1.0-6      
-## [82] R6_2.2.0           gridExtra_2.2.1    codetools_0.2-15  
-## [85] MASS_7.3-45        assertthat_0.1     MAST_1.0.5        
-## [88] pkgmaker_0.22      rprojroot_1.2      withr_1.0.2       
-## [91] locfdr_1.1-8       diptest_0.75-7     grid_3.3.0        
-## [94] tidyr_0.6.1        class_7.3-14       rmarkdown_1.4
+## [1] nlme_3.1-131 bitops_1.0-6 +## [3] bold_0.4.0 doParallel_1.0.10 +## [5] RColorBrewer_1.1-2 progress_1.1.2 +## [7] httr_1.2.1 rprojroot_1.2 +## [9] prabclus_2.2-6 tools_3.4.0 +## [11] backports_1.1.0 R6_2.2.1 +## [13] DBI_0.6-1 lazyeval_0.2.0 +## [15] colorspace_1.3-2 ade4_1.7-6 +## [17] trimcluster_0.1-2 nnet_7.3-12 +## [19] prettyunits_1.0.2 gridExtra_2.2.1 +## [21] compiler_3.4.0 xml2_1.1.1 +## [23] pkgmaker_0.22 diptest_0.75-7 +## [25] scales_0.4.1 DEoptimR_1.0-8 +## [27] mvtnorm_1.0-6 robustbase_0.92-7 +## [29] NMF_0.20.6 stringr_1.2.0 +## [31] digest_0.6.12 rmarkdown_1.5 +## [33] XVector_0.16.0 htmltools_0.3.6 +## [35] limma_3.32.2 rlang_0.1.1 +## [37] howmany_0.3-1 jsonlite_1.5 +## [39] mclust_5.3 dendextend_1.5.2 +## [41] dplyr_0.7.0 RCurl_1.95-4.8 +## [43] magrittr_1.5 modeltools_0.2-21 +## [45] GenomeInfoDbData_0.99.0 Matrix_1.2-10 +## [47] Rcpp_0.12.11 munsell_0.4.3 +## [49] ape_4.1-0.6 abind_1.4-5 +## [51] viridis_0.4.0 stringi_1.1.5 +## [53] whisker_0.3-2 yaml_2.1.14 +## [55] MASS_7.3-47 zlibbioc_1.22.0 +## [57] flexmix_2.3-14 MAST_1.2.1 +## [59] plyr_1.8.4 grid_3.4.0 +## [61] rncl_0.8.2 lattice_0.20-35 +## [63] splines_3.4.0 knitr_1.16 +## [65] uuid_0.1-2 taxize_0.8.4 +## [67] fpc_2.1-10 rngtools_1.2.4 +## [69] reshape2_1.4.2 codetools_0.2-15 +## [71] XML_3.98-1.7 evaluate_0.10 +## [73] RNeXML_2.0.7 data.table_1.10.4 +## [75] foreach_1.4.3 locfdr_1.1-8 +## [77] gtable_0.2.0 tidyr_0.6.3 +## [79] reshape_0.8.6 kernlab_0.9-25 +## [81] assertthat_0.2.0 ggplot2_2.2.1 +## [83] gridBase_0.4-7 phylobase_0.8.4 +## [85] xtable_1.8-2 class_7.3-14 +## [87] viridisLite_0.2.0 tibble_1.3.3 +## [89] iterators_1.0.8 registry_0.3 +## [91] cluster_2.0.6

References

@@ -1432,7 +1452,7 @@

References

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; + script.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"; document.getElementsByTagName("head")[0].appendChild(script); })();