Merge pull request #2 from Wedge-Oxford/remove_small_clusters

Removal of small clusters
Wedge-lab · May 30, 2019 · 159653d · 159653d
2 parents 0247402 + 733a590
commit 159653d
Show file tree

Hide file tree

Showing 8 changed files with 149 additions and 26 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,7 +3,7 @@ Maintainer: Stefan Dentro <sd11@sanger.ac.uk>
 License: GPL-3
 Type: Package
 Title: DPClust - An R package for subclonal reconstruction of tumours using sequencing data
-Version: 2.2.6
+Version: 2.2.7
 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"),
     person("Peter", "Van Loo", role=c("aut")),
     person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")))

diff --git a/R/AssignMutations.R b/R/AssignMutations.R
@@ -456,9 +456,8 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
   localMins = localMins + hypercube.size
   localOptima = array(rep(range[,1],each=no.subsamples),dim(localMins))
   localOptima = localOptima + array(rep((range[,2] - range[,1])/(gridsize-1),each=no.subsamples),dim(localMins)) * localMins
-
   write.table(cbind(localOptima,above95confidence),paste(new_output_folder,"/",samplename,"_localMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = c(paste(samplename,subsamples,sep=""),"above95percentConfidence"))
-  write.table(localOptima[above95confidence,],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))
+  write.table(localOptima[above95confidence,,drop=F],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))
 
   no.optima = nrow(localOptima)
 
@@ -614,11 +613,18 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
     write.table(CIs,paste(new_output_folder,"/",samplename,"_confInts_",density.smooth,".txt",sep=""),col.names = paste(rep(paste(samplename,subsamples,sep=""),each=2),rep(c(".lower.CI",".upper.CI"),no.subsamples),sep=""),row.names=F,sep="\t",quote=F)
   }else{
     most.likely.cluster = rep(1,no.muts)
+    cluster.locations = matrix(NA, nrow=1, ncol=length(subsamples)+3)
+    cluster.locations[1,1] = 1
+    cluster.locations[1,2:(length(subsamples)+1)] = localOptima[1,]
+    cluster.locations[1,(length(subsamples)+2):ncol(cluster.locations)] = c(no.muts, no.muts)
+    mutation.preferences = data.frame(rep(1, no.muts))
+    #cluster.locations = as.data.frame(cluster.locations)
+    #colnames(cluster.locations) = c("cluster.no", paste(samplename, subsamples, sep=""), "estimated.no.of.mutations", "no.of.mutations.assigned")
     warning("No local optima found")
   }
 
   # Remove the estimated number of SNVs per cluster from the cluster locations table  
-  cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations))])
+  cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations)), drop=F])
   # Report only the clusters that have mutations assigned
   non_empty_clusters = cluster.locations[,ncol(cluster.locations)] > 0
   cluster.locations = cluster.locations[non_empty_clusters,,drop=F]
@@ -629,7 +635,7 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
   clust_order = order(rowSums(cluster.locations[,2:(ncol(cluster.locations)-1)]), decreasing=T)
   cluster.locations = cluster.locations[clust_order,,drop=F]
   cluster.locations[,1] = 1:nrow(cluster.locations)
-  mutation.preferences = mutation.preferences[,clust_order]
+  mutation.preferences = mutation.preferences[,clust_order, drop=F]
 
   # get most likely cluster
   most.likely.cluster = max.col(mutation.preferences)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
@@ -7,6 +7,9 @@
 #' @param no.iters.burn.in The number of iterations that should be discarded as burn in of the MCMC chain
 #' @param mut.assignment.type Mutation assignment type option
 #' @param num_muts_sample The number of mutations from which to start downsampling
+#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output (Default: NULL)
+#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final output (Default: 0.01)
+#' @param species Species (Default: Human)
 #' @param is.male Boolean set to TRUE when the donor is male, female otherwise
 #' @param assign_sampled_muts A boolean whether to assign mutations that have not been used for clustering due to downsampling (Default: TRUE)
 #' @param supported_chroms Vector with chromosome names from which mutations can be used (Default: NULL)
@@ -15,19 +18,30 @@
 #' @return A list containing these components
 #' @author sd11
 #' @export
-make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) {
+make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, min_muts_cluster=NULL, min_frac_muts_cluster=0.01, species="human", assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) {
   if (is.null(supported_chroms)) {
-    # Set the expected chromosomes based on the sex
-    if (is.male) {
-      supported_chroms = as.character(c(1:22, "X", "Y"))
+    if (species=="human" | species=="Human") {
+      # Set the expected chromosomes based on the sex
+      if (is.male) {
+        supported_chroms = as.character(c(1:22, "X", "Y"))
+      } else {
+        supported_chroms = as.character(c(1:22, "X"))
+      }
     } else {
-      supported_chroms = as.character(c(1:22, "X"))
+      if (species=="mouse" | species=="Mouse") {
+        # Set the expected chromosomes based on the sex
+        if (is.male) {
+          supported_chroms = as.character(c(1:19, "X", "Y"))
+        } else {
+          supported_chroms = as.character(c(1:19, "X"))
+        }
+      }
     }
   }
 
   return(list(no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, mut.assignment.type=mut.assignment.type, 
               supported_chroms=supported_chroms, num_muts_sample=num_muts_sample, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files,
-              generate_cluster_ordering=generate_cluster_ordering))
+              generate_cluster_ordering=generate_cluster_ordering, species=species, min_muts_cluster=min_muts_cluster, min_frac_muts_cluster=min_frac_muts_cluster))
 }
 
 #' Helper function to package sample parameters
@@ -348,21 +362,33 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     } else {
       density = NA
     }
+
+    polygon_file = file.path(outdir, paste(samplename, "_DirichletProcessplotpolygonData.txt", sep=""))
+    if (file.exists(polygon_file)) {
+      polygon.data = read.table(polygon_file, header=T)
+    } else {
+      polygon.data = NA
+    }
+
     load(file.path(outdir, paste(samplename, "_gsdata.RData", sep="")))
     write_tree = analysis_type != 'nd_dp' & analysis_type != 'reassign_muts_1d' & analysis_type != 'reassign_muts_nd'
     writeStandardFinalOutput(clustering=clustering, 
                              dataset=dataset,
                              most.similar.mut=most.similar.mut,
                              outfiles.prefix=outfiles.prefix,
+                             outdir=outdir,
                              samplename=samplename,
                              subsamplenames=subsamples,
                              GS.data=GS.data,
                              density=density,
+                             polygon.data=polygon.data,
                              no.iters=no.iters, 
                              no.iters.burn.in=no.iters.burn.in,
                              assign_sampled_muts=assign_sampled_muts,
                              write_tree=write_tree,
-                             generate_cluster_ordering=generate_cluster_ordering)
+                             generate_cluster_ordering=generate_cluster_ordering,
+                             min_muts_cluster=min_muts_cluster,
+                             min_frac_muts_cluster=min_frac_muts_cluster)
   }
 
   ####################################################################################################################
@@ -392,9 +418,13 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     .remove_file(file.path(outdir, paste(samplename, "_localMultidimensionalOptima_0.01.txt", sep="")))
     .remove_file(file.path(outdir, paste(samplename, "_optimaInfo_0.01.txt", sep="")))
 
-    for (i in 1:(length(subsamplesrun)-1)) {
-      for (j in (i+1):length(subsamplesrun)) {
+    for (i in 1:(length(subsamples)-1)) {
+      for (j in (i+1):length(subsamples)) {
         .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep="")))
+        .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.csv", sep="")))
+        .remove_file(file.path(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep="")), full.names=T))
+        density_csv_files = list.files(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep="")), full.names=T)
+        for (infile in density_csv_files) { .remove_file(infile) }
       }
     }
 
@@ -409,20 +439,86 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
 #' @param dataset The dataset that went into clustering
 #' @param most.similar.mut Vector containing for each non-sampled mutation its most similar sampled mutation. The non-sampled mutation will be assigned to the same cluster
 #' @param outfiles.prefix A prefix for the filenames
+#' @param outdir Output directory where the replot of the 1D method is to be stored if clusters are removed due to being too small
 #' @param samplename Overall samplename
 #' @param subsamplenames Samplenames of the different timepoints
 #' @param GS.data MCMC output
 #' @param density Posterior density estimate across MCMC iterations
+#' @param polygon.data 1D confidence interval, used for plotting the 1D method (set to NA for multi-D method)
 #' @param no.iters Number of total iterations
 #' @param no.iters.burn.in Number of iterations to be used as burn-in
+#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output
+#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final
 #' @param assign_sampled_muts Boolean whether to assign the non-sampled mutations (Default: TRUE)
 #' @param write_tree Boolean whether to write a tree to file. Not all clustering methods return a tree (Default: FALSE)
 #' @param generate_cluster_ordering Boolean specifying whether a possible cluster ordering should be determined (Default: FALSE)
 #' @param no.samples.cluster.order Number of mutations to sample (with replacement) to classify pairs of clusters into parent-offspring or siblings (Default: 1000)
 #' @author sd11
-writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
+writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, outdir, samplename, subsamplenames, GS.data, density, polygon.data, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
   num_samples = ncol(dataset$mutCount)
 
+  ########################################################################
+  # Check for too small clusters
+  ########################################################################
+  if (nrow(clustering$cluster.locations) > 1 & (min_muts_cluster!=-1 | min_frac_muts_cluster!=-1)) {
+    if (min_muts_cluster!=-1 & min_frac_muts_cluster!=-1) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number")
+
+    # min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster)
+    min_frac_muts_cluster = ifelse(min_frac_muts_cluster==-1, -1, min_frac_muts_cluster*nrow(dataset$mutCount))
+
+    # use min_muts_cluster variable further down, overwrite with min_frac_muts_cluster
+    if (min_frac_muts_cluster > min_muts_cluster) {
+      min_muts_cluster = min_frac_muts_cluster
+    }
+
+    clusters_to_remove = clustering$cluster.locations[,num_samples+2] < min_muts_cluster
+
+    if (sum(clusters_to_remove) > 0 & sum(clusters_to_remove) < length(clusters_to_remove)) {
+      # remove clusters that are too small
+      clusterids_to_remove = clustering$cluster.locations[clusters_to_remove,1]
+      new_cluster.locations = clustering$cluster.locations[!clustering$cluster.locations[,1] %in% clusterids_to_remove,,drop=F]
+      new_all.assignment.likelihoods = clustering$all.assignment.likelihoods[,!clusters_to_remove, drop=F]
+      new_best.assignment.likelihoods = clustering$best.assignment.likelihoods
+      new_best.node.assignments = clustering$best.node.assignments
+      # reset best likelihoods and hard assignments for mutations assigned to the removed cluster(s)
+      new_best.assignment.likelihoods[new_best.node.assignments %in% clusterids_to_remove] = NA
+      new_best.node.assignments[new_best.node.assignments %in% clusterids_to_remove] = NA
+
+      clustering$cluster.locations = new_cluster.locations
+      clustering$all.assignment.likelihoods = new_all.assignment.likelihoods
+      clustering$best.node.assignments = new_best.node.assignments
+      clustering$best.assignment.likelihoods = new_best.assignment.likelihoods
+
+      # if 1D clustering, then replot without the removed cluster
+      if (ncol(dataset$mutCount)==1) {
+        # Old plot
+        plot1D(density=density, 
+               polygon.data=polygon.data[,1], 
+               pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), 
+               density.from=0, 
+               x.max=1.5, 
+               mutationCopyNumber=dataset$mutation.copy.number, 
+               no.chrs.bearing.mut=dataset$copyNumberAdjustment,
+               samplename=samplename,
+               cluster.locations=clustering$cluster.locations,
+               mutation.assignments=clustering$best.node.assignments)
+
+        # New plot
+        plot1D_2(density=density, 
+                 polygon.data=polygon.data[,1],
+                 pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), 
+                 density.from=0, 
+                 x.max=1.5, 
+                 mutationCopyNumber=dataset$mutation.copy.number, 
+                 no.chrs.bearing.mut=dataset$copyNumberAdjustment,
+                 samplename=samplename,
+                 cluster.locations=clustering$cluster.locations,
+                 mutation.assignments=clustering$best.node.assignments,
+                 mutationTypes=dataset$mutationType)
+      }
+    }
+  }
+
   if (generate_cluster_ordering) {
     ########################################################################
     # Before doing anything else, calculate confidence intervals on the cluster locations using only the mutations used during clustering

diff --git a/R/PlotDensities.R b/R/PlotDensities.R
@@ -64,7 +64,7 @@ plot1D = function(density, polygon.data, pngFile=NA, density.from=0, x.max=NA, y
   if(!is.null(cluster.locations) & !is.null(mutation.assignments)) {
     assign.counts = table(mutation.assignments)
 
-    for (cluster in unique(mutation.assignments)) {
+    for (cluster in unique(mutation.assignments[!is.na(mutation.assignments)])) {
       x = cluster.locations[cluster.locations[,1]==cluster, 2]
       lines(x=c(x, x), y=c(0, y.max), col="black", lwd=3)
       text(paste("Cluster",cluster, sep=" "), x=x+0.01, y=(9/10)*y.max, adj=c(0,0), cex=2)

diff --git a/README.md b/README.md
@@ -35,12 +35,12 @@ cd dpclust/inst/example
 
 Run DPClust on provided example data. After checking out this repository, build the image:
 ```
-docker build -t dpclust:2.2.6 .
+docker build -t dpclust:2.2.7 .
 ```
 
 Run DPClust as follows
 ```
-docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.6 /opt/dpclust/example/run_docker.sh
+docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.7 /opt/dpclust/example/run_docker.sh
 ```
 
 ## Input description