From 7ecec055ae2b9470a9cabf0a62a957a66afb7e94 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 09:25:23 +0100
Subject: [PATCH 01/11] Fix issue when only a single density optimum with
 mutations is found

---
 R/AssignMutations.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/AssignMutations.R b/R/AssignMutations.R
index 1b42617..9ed38d9 100755
--- a/R/AssignMutations.R
+++ b/R/AssignMutations.R
@@ -91,7 +91,7 @@ oneDimensionalClustering <- function(samplename, subclonal.fraction, GS.data, de
     clust_order = order(cluster_locations[,2], decreasing=T)
     cluster_locations = cluster_locations[clust_order,,drop=F]
     cluster_locations[,1] = 1:nrow(cluster_locations)
-    mutation.preferences = mutation.preferences[,clust_order]
+    mutation.preferences = mutation.preferences[,clust_order, drop=F]
     
     # get most likely cluster
     most.likely.cluster = max.col(mutation.preferences)

From 0247402e399c1a282e9858af37780832b2f6abd9 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 09:47:27 +0100
Subject: [PATCH 02/11] Adding in making of 2D plots and removal of created
 density files, if requested to remove temp output

---
 R/DirichletProcessClustering.R | 38 ++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index b6a338a..4f875b3 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -392,6 +392,12 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     .remove_file(file.path(outdir, paste(samplename, "_localMultidimensionalOptima_0.01.txt", sep="")))
     .remove_file(file.path(outdir, paste(samplename, "_optimaInfo_0.01.txt", sep="")))
     
+    for (i in 1:(length(subsamplesrun)-1)) {
+      for (j in (i+1):length(subsamplesrun)) {
+        .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep="")))
+      }
+    }
+    
     nd_density_files = list.files(outdir, pattern="_2D_binomial_")
     if (length(nd_density_files) > 0) { file.remove(nd_density_files) }
   }
@@ -697,24 +703,24 @@ DirichletProcessClustering <- function(mutCount, WTCount, totalCopyNumber, copyN
     ########################
     # Plot density and Assign mutations to clusters - nD
     ########################
-    print("Assigning mutations to clusters...")
-    # print("Estimating density between pairs of samples...")
-    # for (i in 1:(length(subsamplesrun)-1)) {
-    #   for (j in (i+1):length(subsamplesrun)) {
-    #     print(paste("Samples", subsamplesrun[i], "and", subsamplesrun[j], sep=" "))
-    #     imageFile = file.path(output_folder, paste(samplename,subsamplesrun[i],subsamplesrun[j],"_iters",no.iters,"_concParam",conc_param,"_clusterWidth",1/cluster_conc,"_2D_binomial.png",sep=""))
-    #     density = Gibbs.subclone.density.est(mutation.copy.number[,c(i,j)]/copyNumberAdjustment[,c(i,j)],
-    #                                          GS.data,
-    #                                          imageFile, 
-    #                                          post.burn.in.start = no.iters.burn.in, 
-    #                                          post.burn.in.stop = no.iters, 
-    #                                          samplenames = paste(samplename,subsamplesrun[c(i,j)],sep=""),
-    #                                          indices=c(i,j))  
-    #     save(file=file.path(output_folder, paste(samplename, subsamplesrun[i], subsamplesrun[j], "_densityoutput.RData", sep="")), GS.data, density)
-    #   }
-    # }
+    print("Estimating density between pairs of samples...")
+    for (i in 1:(length(subsamplesrun)-1)) {
+      for (j in (i+1):length(subsamplesrun)) {
+        print(paste("Samples", subsamplesrun[i], "and", subsamplesrun[j], sep=" "))
+        imageFile = file.path(output_folder, paste(samplename,subsamplesrun[i],subsamplesrun[j],"_iters",no.iters,"_concParam",conc_param,"_clusterWidth",1/cluster_conc,"_2D_binomial.png",sep=""))
+        density = Gibbs.subclone.density.est(mutation.copy.number[,c(i,j)]/copyNumberAdjustment[,c(i,j)],
+                                             GS.data,
+                                             imageFile,
+                                             post.burn.in.start = no.iters.burn.in,
+                                             post.burn.in.stop = no.iters,
+                                             samplenames = paste(samplename,subsamplesrun[c(i,j)],sep=""),
+                                             indices=c(i,j))
+        save(file=file.path(output_folder, paste(samplename, subsamplesrun[i], subsamplesrun[j], "_densityoutput.RData", sep="")), GS.data, density)
+      }
+    }
     
     # Assign mutations to clusters using one of the different assignment methods
+    print("Assigning mutations to clusters...")
     opts = list(samplename=samplename, subsamplenames=subsamplesrun, no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, no.iters.post.burn.in=no.iters-no.iters.burn.in, outdir=output_folder)
     if (mut.assignment.type == 1) {
       consClustering = multiDimensionalClustering(mutation.copy.number=mutation.copy.number, 

From d812df4a3d8a724df4a5c23471e2a2bed6dbd69f Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 11:27:29 +0100
Subject: [PATCH 03/11] Removal of small clusters code added

---
 R/DirichletProcessClustering.R  | 73 +++++++++++++++++++++++++++++----
 inst/example/dpclust_pipeline.R |  9 +++-
 2 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index 4f875b3..111e8ee 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -7,6 +7,9 @@
 #' @param no.iters.burn.in The number of iterations that should be discarded as burn in of the MCMC chain
 #' @param mut.assignment.type Mutation assignment type option
 #' @param num_muts_sample The number of mutations from which to start downsampling
+#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output (Default: NULL)
+#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final output (Default: 0.01)
+#' @param species Species (Default: Human)
 #' @param is.male Boolean set to TRUE when the donor is male, female otherwise
 #' @param assign_sampled_muts A boolean whether to assign mutations that have not been used for clustering due to downsampling (Default: TRUE)
 #' @param supported_chroms Vector with chromosome names from which mutations can be used (Default: NULL)
@@ -15,19 +18,30 @@
 #' @return A list containing these components
 #' @author sd11
 #' @export
-make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) {
+make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, min_muts_cluster=NULL, min_frac_muts_cluster=0.01, species="human", assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) {
   if (is.null(supported_chroms)) {
-    # Set the expected chromosomes based on the sex
-    if (is.male) {
-      supported_chroms = as.character(c(1:22, "X", "Y"))
+    if (species=="human" | species=="Human") {
+      # Set the expected chromosomes based on the sex
+      if (is.male) {
+        supported_chroms = as.character(c(1:22, "X", "Y"))
+      } else {
+        supported_chroms = as.character(c(1:22, "X"))
+      }
     } else {
-      supported_chroms = as.character(c(1:22, "X"))
+      if (species=="mouse" | species=="Mouse") {
+        # Set the expected chromosomes based on the sex
+        if (is.male) {
+          supported_chroms = as.character(c(1:19, "X", "Y"))
+        } else {
+          supported_chroms = as.character(c(1:19, "X"))
+        }
+      }
     }
   }
   
   return(list(no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, mut.assignment.type=mut.assignment.type, 
               supported_chroms=supported_chroms, num_muts_sample=num_muts_sample, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files,
-              generate_cluster_ordering=generate_cluster_ordering))
+              generate_cluster_ordering=generate_cluster_ordering, species=species, min_muts_cluster=min_muts_cluster, min_frac_muts_cluster=min_frac_muts_cluster))
 }
 
 #' Helper function to package sample parameters
@@ -362,7 +376,9 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
                              no.iters.burn.in=no.iters.burn.in,
                              assign_sampled_muts=assign_sampled_muts,
                              write_tree=write_tree,
-                             generate_cluster_ordering=generate_cluster_ordering)
+                             generate_cluster_ordering=generate_cluster_ordering,
+                             min_muts_cluster=min_muts_cluster,
+                             min_frac_muts_cluster=min_frac_muts_cluster)
   }
   
   ####################################################################################################################
@@ -415,14 +431,55 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
 #' @param density Posterior density estimate across MCMC iterations
 #' @param no.iters Number of total iterations
 #' @param no.iters.burn.in Number of iterations to be used as burn-in
+#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output
+#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final
 #' @param assign_sampled_muts Boolean whether to assign the non-sampled mutations (Default: TRUE)
 #' @param write_tree Boolean whether to write a tree to file. Not all clustering methods return a tree (Default: FALSE)
 #' @param generate_cluster_ordering Boolean specifying whether a possible cluster ordering should be determined (Default: FALSE)
 #' @param no.samples.cluster.order Number of mutations to sample (with replacement) to classify pairs of clusters into parent-offspring or siblings (Default: 1000)
 #' @author sd11
-writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
+writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
   num_samples = ncol(dataset$mutCount)
   
+  ########################################################################
+  # Check for too small clusters
+  ########################################################################
+  if (nrow(clustering$cluster.locations) > 1 & (!is.null(min_muts_cluster) | !is.null(min_frac_muts_cluster))) {
+    if (!is.null(min_muts_cluster) & !is.null(min_frac_muts_cluster)) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number")
+    
+    min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster)
+    min_frac_muts_cluster = ifelse(is.null(min_frac_muts_cluster), -1, min_frac_muts_cluster*nrow(dataset$mutCount))
+    
+    # use min_muts_cluster variable further down, overwrite with min_frac_muts_cluster
+    if (min_frac_muts_cluster > min_muts_cluster) {
+      min_muts_cluster = min_frac_muts_cluster
+    }
+    
+    clusters_to_remove = clustering$cluster.locations[,num_samples+2] < min_muts_cluster
+    
+    if (sum(clusters_to_remove) > 0) {
+      # remove clusters that are too small
+      clusterids_to_remove = clustering$cluster.locations[clusters_to_remove,1]
+      new_cluster.locations = clustering$cluster.locations[!clustering$cluster.locations[,1] %in% clusterids_to_remove,,drop=F]
+      new_all.assignment.likelihoods = clustering$all.assignment.likelihoods[,!clusters_to_remove, drop=F]
+      new_best.assignment.likelihoods = clustering$best.assignment.likelihoods
+      new_best.node.assignments = clustering$best.node.assignments
+      # reset best likelihoods and hard assignments for mutations assigned to the removed cluster(s)
+      new_best.assignment.likelihoods[new_best.node.assignments %in% clusterids_to_remove] = NA
+      new_best.node.assignments[new_best.node.assignments %in% clusterids_to_remove] = NA
+      
+      clustering$cluster.locations = new_cluster.locations
+      clustering$all.assignment.likelihoods = new_all.assignment.likelihoods
+      clustering$best.node.assignments = new_best.node.assignments
+      clustering$best.assignment.likelihoods = new_best.node.assignments
+      
+      # if 1D clustering, then replot without the removed cluster
+      if (ncol(dataset$mutCount)==1) {
+        # todo
+      }
+    }
+  }
+    
   if (generate_cluster_ordering) {
     ########################################################################
     # Before doing anything else, calculate confidence intervals on the cluster locations using only the mutations used during clustering
diff --git a/inst/example/dpclust_pipeline.R b/inst/example/dpclust_pipeline.R
index ad78fd1..668fc3a 100755
--- a/inst/example/dpclust_pipeline.R
+++ b/inst/example/dpclust_pipeline.R
@@ -19,6 +19,8 @@ option_list = list(
   make_option(c("--iterations"), type="integer", default=2000, help="Number of iterations to run the MCMC chain", metavar="character"),
   make_option(c("--burnin"), type="integer", default=1000, help="Number of iterations to discard as burnin", metavar="character"),
   make_option(c("--mut_assignment_type"), type="integer", default=1, help="Mutation assignment method", metavar="character"),
+  make_option(c("--min_muts_cluster"), type="integer", default=-1, help="Minimum number of mutations per cluster required for it to be kept in the final output, set to -1 to disable (default), see also --min_frac_muts_cluster", metavar="character"),
+  make_option(c("--min_frac_muts_cluster"), type="integer", default=0.01, help="Minimum fraction of mutations per cluster required for it to be kept in the final output, set to -1 to disable, see also --min_muts_cluster", metavar="character"),
   make_option(c("--num_muts_sample"), type="integer", default=50000, help="Number of mutations from which downsampling starts", metavar="character"),
   make_option(c("--bin_size"), type="double", default=NULL, help="Binsize to use when constructing multi-dimensional density - only used when number of samples > 1", metavar="character"),
   make_option(c("--seed"), type="integer", default=123, help="Provide a seed", metavar="character"),
@@ -41,6 +43,8 @@ bin.size = opt$bin_size
 seed = opt$seed
 assign_sampled_muts = opt$assign_sampled_muts
 keep_temp_files = opt$keep_temp_files
+min_muts_cluster = ifelse(opt$min_muts_cluster==-1, NULL, opt$min_muts_cluster) # set absolute minimum number of mutations per cluster
+min_frac_muts_cluster = ifelse(opt$min_frac_muts_cluster==-1, NULL, opt$min_frac_muts_cluster) # set proportional minimum number of mutations per cluster
 
 if (is.null(outdir)) { outdir = getwd() }
 
@@ -54,6 +58,7 @@ is.male = T
 sample.snvs.only = T # Perform sampling on just the SNVs and not on CNAs
 remove.snvs = F # Clear all SNVs, to perform clustering on CNAs only - This needs a better solution
 generate_cluster_ordering = F
+species = "human" # mouse also supported, just changes the chromosomes on which mutations are kept, has not effect on functionality
 
 # Cocluster CNA parameters
 co_cluster_cna = F
@@ -104,12 +109,12 @@ print("")
 #####################################################################################
 # Create output path
 #####################################################################################
-outdir = paste(outdir, "/", samplename, "_DPoutput_", no.iters,"iters_", no.iters.burn.in, "burnin_seed", seed, "/", sep="")
+outdir = file.path(outdir, paste(samplename, "_DPoutput_", no.iters,"iters_", no.iters.burn.in, "burnin_seed", seed, "/", sep=""))
 
 #####################################################################################
 # Setup parameters
 #####################################################################################
-run_params = make_run_params(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male=is.male, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files, generate_cluster_ordering=generate_cluster_ordering)
+run_params = make_run_params(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male=is.male, min_muts_cluster=min_muts_cluster, min_frac_muts_cluster=min_frac_muts_cluster, species=species, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files, generate_cluster_ordering=generate_cluster_ordering)
 sample_params = make_sample_params(datafiles, cellularity, is.male, samplename, subsamples, mutphasingfiles)
 advanced_params = make_advanced_params(seed)
 

From 6efd33c3f6e7acf1ffa32c862614e355580b7555 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 12:16:01 +0100
Subject: [PATCH 04/11] Allow for replot of 1D figures after a cluster is
 removed

---
 R/DirichletProcessClustering.R | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index 111e8ee..d346d1c 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -362,16 +362,21 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     } else {
       density = NA
     }
+    polygon_file = file.path(outdir, paste(samplename, "_DirichletProcessplotpolygonData.txt", sep=""))
+    polygon.data = ifelse(file.exists(polygon_file), read.table(polygon_file, header=T), NA)
+    
     load(file.path(outdir, paste(samplename, "_gsdata.RData", sep="")))
     write_tree = analysis_type != 'nd_dp' & analysis_type != 'reassign_muts_1d' & analysis_type != 'reassign_muts_nd'
     writeStandardFinalOutput(clustering=clustering, 
                              dataset=dataset,
                              most.similar.mut=most.similar.mut,
                              outfiles.prefix=outfiles.prefix,
+                             outdir=outdir,
                              samplename=samplename,
                              subsamplenames=subsamples,
                              GS.data=GS.data,
                              density=density,
+                             polygon.data=polygon.data,
                              no.iters=no.iters, 
                              no.iters.burn.in=no.iters.burn.in,
                              assign_sampled_muts=assign_sampled_muts,
@@ -425,10 +430,12 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
 #' @param dataset The dataset that went into clustering
 #' @param most.similar.mut Vector containing for each non-sampled mutation its most similar sampled mutation. The non-sampled mutation will be assigned to the same cluster
 #' @param outfiles.prefix A prefix for the filenames
+#' @param outdir Output directory where the replot of the 1D method is to be stored if clusters are removed due to being too small
 #' @param samplename Overall samplename
 #' @param subsamplenames Samplenames of the different timepoints
 #' @param GS.data MCMC output
 #' @param density Posterior density estimate across MCMC iterations
+#' @param polygon.data 1D confidence interval, used for plotting the 1D method (set to NA for multi-D method)
 #' @param no.iters Number of total iterations
 #' @param no.iters.burn.in Number of iterations to be used as burn-in
 #' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output
@@ -438,7 +445,7 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
 #' @param generate_cluster_ordering Boolean specifying whether a possible cluster ordering should be determined (Default: FALSE)
 #' @param no.samples.cluster.order Number of mutations to sample (with replacement) to classify pairs of clusters into parent-offspring or siblings (Default: 1000)
 #' @author sd11
-writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
+writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, outdir, samplename, subsamplenames, GS.data, density, polygon.data, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
   num_samples = ncol(dataset$mutCount)
   
   ########################################################################
@@ -471,11 +478,17 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
       clustering$cluster.locations = new_cluster.locations
       clustering$all.assignment.likelihoods = new_all.assignment.likelihoods
       clustering$best.node.assignments = new_best.node.assignments
-      clustering$best.assignment.likelihoods = new_best.node.assignments
+      clustering$best.assignment.likelihoods = new_best.assignment.likelihoods
       
       # if 1D clustering, then replot without the removed cluster
       if (ncol(dataset$mutCount)==1) {
-        # todo
+        replot_1D(outdir=outdir,
+                  outfiles.prefix=outfiles.prefix,
+                  samplename=samplename, 
+                  dataset=dataset, 
+                  clustering=clustering, 
+                  density=density, 
+                  polygon.data=polygon.data)
       }
     }
   }

From d2ec9c695e112200478459e323f5391a69989760 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 13:18:14 +0100
Subject: [PATCH 05/11] Change handling of default values of minimum cluster
 size

---
 R/DirichletProcessClustering.R  | 2 +-
 inst/example/dpclust_pipeline.R | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index d346d1c..398cef4 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -452,7 +452,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
   # Check for too small clusters
   ########################################################################
   if (nrow(clustering$cluster.locations) > 1 & (!is.null(min_muts_cluster) | !is.null(min_frac_muts_cluster))) {
-    if (!is.null(min_muts_cluster) & !is.null(min_frac_muts_cluster)) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number")
+    if (min_muts_cluster!=-1 & min_frac_muts_cluster!=-1) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number")
     
     min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster)
     min_frac_muts_cluster = ifelse(is.null(min_frac_muts_cluster), -1, min_frac_muts_cluster*nrow(dataset$mutCount))
diff --git a/inst/example/dpclust_pipeline.R b/inst/example/dpclust_pipeline.R
index 668fc3a..5efc613 100755
--- a/inst/example/dpclust_pipeline.R
+++ b/inst/example/dpclust_pipeline.R
@@ -20,7 +20,7 @@ option_list = list(
   make_option(c("--burnin"), type="integer", default=1000, help="Number of iterations to discard as burnin", metavar="character"),
   make_option(c("--mut_assignment_type"), type="integer", default=1, help="Mutation assignment method", metavar="character"),
   make_option(c("--min_muts_cluster"), type="integer", default=-1, help="Minimum number of mutations per cluster required for it to be kept in the final output, set to -1 to disable (default), see also --min_frac_muts_cluster", metavar="character"),
-  make_option(c("--min_frac_muts_cluster"), type="integer", default=0.01, help="Minimum fraction of mutations per cluster required for it to be kept in the final output, set to -1 to disable, see also --min_muts_cluster", metavar="character"),
+  make_option(c("--min_frac_muts_cluster"), type="numeric", default=0.01, help="Minimum fraction of mutations per cluster required for it to be kept in the final output, set to -1 to disable, see also --min_muts_cluster", metavar="character"),
   make_option(c("--num_muts_sample"), type="integer", default=50000, help="Number of mutations from which downsampling starts", metavar="character"),
   make_option(c("--bin_size"), type="double", default=NULL, help="Binsize to use when constructing multi-dimensional density - only used when number of samples > 1", metavar="character"),
   make_option(c("--seed"), type="integer", default=123, help="Provide a seed", metavar="character"),
@@ -43,8 +43,8 @@ bin.size = opt$bin_size
 seed = opt$seed
 assign_sampled_muts = opt$assign_sampled_muts
 keep_temp_files = opt$keep_temp_files
-min_muts_cluster = ifelse(opt$min_muts_cluster==-1, NULL, opt$min_muts_cluster) # set absolute minimum number of mutations per cluster
-min_frac_muts_cluster = ifelse(opt$min_frac_muts_cluster==-1, NULL, opt$min_frac_muts_cluster) # set proportional minimum number of mutations per cluster
+min_muts_cluster = opt$min_muts_cluster # set absolute minimum number of mutations per cluster
+min_frac_muts_cluster = opt$min_frac_muts_cluster # set proportional minimum number of mutations per cluster
 
 if (is.null(outdir)) { outdir = getwd() }
 
@@ -82,7 +82,7 @@ subsamples = sample2purity[sample2purity$sample==samplename,]$subsample
 cellularity = sample2purity[sample2purity$sample==samplename,]$cellularity
 
 if ("sex" %in% colnames(sample2purity)) {
-  is.male = sample2purity[sample2purity$sample==samplename,]$sex=="male"
+  is.male = (sample2purity[sample2purity$sample==samplename,]$sex=="male")[1]
   cndatafiles = sample2purity[sample2purity$sample==samplename,]$cndatafile
 } else {
   is.male = T

From 88ebc40f107dbe1639902d7260673a38dd90386c Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 13:41:55 +0100
Subject: [PATCH 06/11] Replace replot function call with direct call to
 plotting functions

---
 R/DirichletProcessClustering.R | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index 398cef4..0925d96 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -482,13 +482,30 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
       
       # if 1D clustering, then replot without the removed cluster
       if (ncol(dataset$mutCount)==1) {
-        replot_1D(outdir=outdir,
-                  outfiles.prefix=outfiles.prefix,
-                  samplename=samplename, 
-                  dataset=dataset, 
-                  clustering=clustering, 
-                  density=density, 
-                  polygon.data=polygon.data)
+        # Old plot
+        plot1D(density=density, 
+               polygon.data=polygon.data[,1], 
+               pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), 
+               density.from=0, 
+               x.max=1.5, 
+               mutationCopyNumber=dataset$mutation.copy.number, 
+               no.chrs.bearing.mut=dataset$copyNumberAdjustment,
+               samplename=samplename,
+               cluster.locations=clustering$cluster.locations,
+               mutation.assignments=clustering$best.node.assignments)
+        
+        # New plot
+        plot1D_2(density=density, 
+                 polygon.data=polygon.data[,1],
+                 pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), 
+                 density.from=0, 
+                 x.max=1.5, 
+                 mutationCopyNumber=dataset$mutation.copy.number, 
+                 no.chrs.bearing.mut=dataset$copyNumberAdjustment,
+                 samplename=samplename,
+                 cluster.locations=clustering$cluster.locations,
+                 mutation.assignments=clustering$best.node.assignments,
+                 mutationTypes=dataset$mutationType)
       }
     }
   }

From 1962273be4bbfd32397e360fa936e9f28144b9df Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 15:25:56 +0100
Subject: [PATCH 07/11] Removing NAs when plotting cluster locations and
 cleanup of min_muts_cluster parameters

---
 R/DirichletProcessClustering.R | 17 +++++++++++------
 R/PlotDensities.R              |  2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index 0925d96..839b1ce 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -362,8 +362,13 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     } else {
       density = NA
     }
+    
     polygon_file = file.path(outdir, paste(samplename, "_DirichletProcessplotpolygonData.txt", sep=""))
-    polygon.data = ifelse(file.exists(polygon_file), read.table(polygon_file, header=T), NA)
+    if (file.exists(polygon_file)) {
+      polygon.data = read.table(polygon_file, header=T)
+    } else {
+      polygon.data = NA
+    }
     
     load(file.path(outdir, paste(samplename, "_gsdata.RData", sep="")))
     write_tree = analysis_type != 'nd_dp' & analysis_type != 'reassign_muts_1d' & analysis_type != 'reassign_muts_nd'
@@ -451,11 +456,11 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
   ########################################################################
   # Check for too small clusters
   ########################################################################
-  if (nrow(clustering$cluster.locations) > 1 & (!is.null(min_muts_cluster) | !is.null(min_frac_muts_cluster))) {
+  if (nrow(clustering$cluster.locations) > 1 & (min_muts_cluster!=-1 | min_frac_muts_cluster!=-1)) {
     if (min_muts_cluster!=-1 & min_frac_muts_cluster!=-1) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number")
     
-    min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster)
-    min_frac_muts_cluster = ifelse(is.null(min_frac_muts_cluster), -1, min_frac_muts_cluster*nrow(dataset$mutCount))
+    # min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster)
+    min_frac_muts_cluster = ifelse(min_frac_muts_cluster==-1, -1, min_frac_muts_cluster*nrow(dataset$mutCount))
     
     # use min_muts_cluster variable further down, overwrite with min_frac_muts_cluster
     if (min_frac_muts_cluster > min_muts_cluster) {
@@ -483,7 +488,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
       # if 1D clustering, then replot without the removed cluster
       if (ncol(dataset$mutCount)==1) {
         # Old plot
-        plot1D(density=density, 
+        DPClust:::plot1D(density=density, 
                polygon.data=polygon.data[,1], 
                pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), 
                density.from=0, 
@@ -495,7 +500,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
                mutation.assignments=clustering$best.node.assignments)
         
         # New plot
-        plot1D_2(density=density, 
+        DPClust:::plot1D_2(density=density, 
                  polygon.data=polygon.data[,1],
                  pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), 
                  density.from=0, 
diff --git a/R/PlotDensities.R b/R/PlotDensities.R
index 339c5b9..c6c9b2e 100755
--- a/R/PlotDensities.R
+++ b/R/PlotDensities.R
@@ -64,7 +64,7 @@ plot1D = function(density, polygon.data, pngFile=NA, density.from=0, x.max=NA, y
   if(!is.null(cluster.locations) & !is.null(mutation.assignments)) {
     assign.counts = table(mutation.assignments)
     
-    for (cluster in unique(mutation.assignments)) {
+    for (cluster in unique(mutation.assignments[!is.na(mutation.assignments)])) {
       x = cluster.locations[cluster.locations[,1]==cluster, 2]
       lines(x=c(x, x), y=c(0, y.max), col="black", lwd=3)
       text(paste("Cluster",cluster, sep=" "), x=x+0.01, y=(9/10)*y.max, adj=c(0,0), cex=2)

From dc446bab1f735ade67cc782898857a26889f12b1 Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Tue, 28 May 2019 17:32:28 +0100
Subject: [PATCH 08/11] Catch case of wrong variable being called and add
 removal of txt files

---
 R/DirichletProcessClustering.R | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index 839b1ce..c93dd12 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -418,9 +418,12 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     .remove_file(file.path(outdir, paste(samplename, "_localMultidimensionalOptima_0.01.txt", sep="")))
     .remove_file(file.path(outdir, paste(samplename, "_optimaInfo_0.01.txt", sep="")))
     
-    for (i in 1:(length(subsamplesrun)-1)) {
-      for (j in (i+1):length(subsamplesrun)) {
+    for (i in 1:(length(subsamples)-1)) {
+      for (j in (i+1):length(subsamples)) {
         .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep="")))
+        .remove_file(file.path(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep="")))
+        density_csv_files = list.files(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep=""), full.names=T)
+        for (infile in density_csv_files) { .remove_file(infile) }
       }
     }
     
@@ -488,7 +491,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
       # if 1D clustering, then replot without the removed cluster
       if (ncol(dataset$mutCount)==1) {
         # Old plot
-        DPClust:::plot1D(density=density, 
+        plot1D(density=density, 
                polygon.data=polygon.data[,1], 
                pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), 
                density.from=0, 
@@ -500,7 +503,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
                mutation.assignments=clustering$best.node.assignments)
         
         # New plot
-        DPClust:::plot1D_2(density=density, 
+        plot1D_2(density=density, 
                  polygon.data=polygon.data[,1],
                  pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), 
                  density.from=0, 

From cfc7753badfc74caa0c072d9bfd65067e1af80ac Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 29 May 2019 10:57:46 +0100
Subject: [PATCH 09/11] Fix removal of files

---
 R/DirichletProcessClustering.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R
index c93dd12..ad2098a 100755
--- a/R/DirichletProcessClustering.R
+++ b/R/DirichletProcessClustering.R
@@ -421,8 +421,9 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
     for (i in 1:(length(subsamples)-1)) {
       for (j in (i+1):length(subsamples)) {
         .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep="")))
-        .remove_file(file.path(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep="")))
-        density_csv_files = list.files(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep=""), full.names=T)
+        .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.csv", sep="")))
+        .remove_file(file.path(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep="")), full.names=T))
+        density_csv_files = list.files(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep="")), full.names=T)
         for (infile in density_csv_files) { .remove_file(infile) }
       }
     }
@@ -472,7 +473,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi
     
     clusters_to_remove = clustering$cluster.locations[,num_samples+2] < min_muts_cluster
     
-    if (sum(clusters_to_remove) > 0) {
+    if (sum(clusters_to_remove) > 0 & sum(clusters_to_remove) < length(clusters_to_remove)) {
       # remove clusters that are too small
       clusterids_to_remove = clustering$cluster.locations[clusters_to_remove,1]
       new_cluster.locations = clustering$cluster.locations[!clustering$cluster.locations[,1] %in% clusterids_to_remove,,drop=F]

From c35fc777451b73e102cb4cc1d3831311a62a16ce Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Wed, 29 May 2019 11:52:15 +0100
Subject: [PATCH 10/11] Update documentation and version number

---
 DESCRIPTION                     |  2 +-
 README.md                       |  4 ++--
 man/make_run_params.Rd          | 13 ++++++++++---
 man/writeStandardFinalOutput.Rd | 13 +++++++++++--
 4 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5973f82..f9230f6 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Maintainer: Stefan Dentro <sd11@sanger.ac.uk>
 License: GPL-3
 Type: Package
 Title: DPClust - An R package for subclonal reconstruction of tumours using sequencing data
-Version: 2.2.6
+Version: 2.2.7
 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"),
     person("Peter", "Van Loo", role=c("aut")),
     person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")))
diff --git a/README.md b/README.md
index 4bbe900..b2af712 100644
--- a/README.md
+++ b/README.md
@@ -35,12 +35,12 @@ cd dpclust/inst/example
 
 Run DPClust on provided example data. After checking out this repository, build the image:
 ```
-docker build -t dpclust:2.2.6 .
+docker build -t dpclust:2.2.7 .
 ```
 
 Run DPClust as follows
 ```
-docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.6 /opt/dpclust/example/run_docker.sh
+docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.7 /opt/dpclust/example/run_docker.sh
 ```
 
 ## Input description
diff --git a/man/make_run_params.Rd b/man/make_run_params.Rd
index 35b6303..b291a35 100755
--- a/man/make_run_params.Rd
+++ b/man/make_run_params.Rd
@@ -5,9 +5,10 @@
 \title{Helper function to package run parameters}
 \usage{
 make_run_params(no.iters, no.iters.burn.in, mut.assignment.type,
-  num_muts_sample, is.male, assign_sampled_muts = TRUE,
-  supported_chroms = NULL, keep_temp_files = TRUE,
-  generate_cluster_ordering = FALSE)
+  num_muts_sample, is.male, min_muts_cluster = NULL,
+  min_frac_muts_cluster = 0.01, species = "human",
+  assign_sampled_muts = TRUE, supported_chroms = NULL,
+  keep_temp_files = TRUE, generate_cluster_ordering = FALSE)
 }
 \arguments{
 \item{no.iters}{The number of iterations that the MCMC chain should be run for}
@@ -20,6 +21,12 @@ make_run_params(no.iters, no.iters.burn.in, mut.assignment.type,
 
 \item{is.male}{Boolean set to TRUE when the donor is male, female otherwise}
 
+\item{min_muts_cluster}{The minimum number of mutations required for a cluster to be kept in the final output (Default: NULL)}
+
+\item{min_frac_muts_cluster}{The minimum fraction of mutations required for a cluster to be kept in the final output (Default: 0.01)}
+
+\item{species}{Species (Default: Human)}
+
 \item{assign_sampled_muts}{A boolean whether to assign mutations that have not been used for clustering due to downsampling (Default: TRUE)}
 
 \item{supported_chroms}{Vector with chromosome names from which mutations can be used (Default: NULL)}
diff --git a/man/writeStandardFinalOutput.Rd b/man/writeStandardFinalOutput.Rd
index 665f807..bde2211 100755
--- a/man/writeStandardFinalOutput.Rd
+++ b/man/writeStandardFinalOutput.Rd
@@ -5,8 +5,9 @@
 \title{Function that stores the final output in a unified format on disk}
 \usage{
 writeStandardFinalOutput(clustering, dataset, most.similar.mut,
-  outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters,
-  no.iters.burn.in, assign_sampled_muts = T, write_tree = F,
+  outfiles.prefix, outdir, samplename, subsamplenames, GS.data, density,
+  polygon.data, no.iters, no.iters.burn.in, min_muts_cluster,
+  min_frac_muts_cluster, assign_sampled_muts = T, write_tree = F,
   generate_cluster_ordering = F, no.samples.cluster.order = 1000)
 }
 \arguments{
@@ -18,6 +19,8 @@ writeStandardFinalOutput(clustering, dataset, most.similar.mut,
 
 \item{outfiles.prefix}{A prefix for the filenames}
 
+\item{outdir}{Output directory where the replot of the 1D method is to be stored if clusters are removed due to being too small}
+
 \item{samplename}{Overall samplename}
 
 \item{subsamplenames}{Samplenames of the different timepoints}
@@ -26,10 +29,16 @@ writeStandardFinalOutput(clustering, dataset, most.similar.mut,
 
 \item{density}{Posterior density estimate across MCMC iterations}
 
+\item{polygon.data}{1D confidence interval, used for plotting the 1D method (set to NA for multi-D method)}
+
 \item{no.iters}{Number of total iterations}
 
 \item{no.iters.burn.in}{Number of iterations to be used as burn-in}
 
+\item{min_muts_cluster}{The minimum number of mutations required for a cluster to be kept in the final output}
+
+\item{min_frac_muts_cluster}{The minimum fraction of mutations required for a cluster to be kept in the final}
+
 \item{assign_sampled_muts}{Boolean whether to assign the non-sampled mutations (Default: TRUE)}
 
 \item{write_tree}{Boolean whether to write a tree to file. Not all clustering methods return a tree (Default: FALSE)}

From f6b12e88247d4a1871962bab52797e06a32c26cf Mon Sep 17 00:00:00 2001
From: Stefan Dentro <sd11@sanger.ac.uk>
Date: Thu, 30 May 2019 21:25:34 +0100
Subject: [PATCH 11/11] Catch case of single optimum in nd clustering

---
 R/AssignMutations.R | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/R/AssignMutations.R b/R/AssignMutations.R
index 9ed38d9..06b1014 100755
--- a/R/AssignMutations.R
+++ b/R/AssignMutations.R
@@ -456,9 +456,8 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
   localMins = localMins + hypercube.size
   localOptima = array(rep(range[,1],each=no.subsamples),dim(localMins))
   localOptima = localOptima + array(rep((range[,2] - range[,1])/(gridsize-1),each=no.subsamples),dim(localMins)) * localMins
-  
   write.table(cbind(localOptima,above95confidence),paste(new_output_folder,"/",samplename,"_localMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = c(paste(samplename,subsamples,sep=""),"above95percentConfidence"))
-  write.table(localOptima[above95confidence,],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))
+  write.table(localOptima[above95confidence,,drop=F],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))
   
   no.optima = nrow(localOptima)
   
@@ -614,11 +613,18 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
     write.table(CIs,paste(new_output_folder,"/",samplename,"_confInts_",density.smooth,".txt",sep=""),col.names = paste(rep(paste(samplename,subsamples,sep=""),each=2),rep(c(".lower.CI",".upper.CI"),no.subsamples),sep=""),row.names=F,sep="\t",quote=F)
   }else{
     most.likely.cluster = rep(1,no.muts)
+    cluster.locations = matrix(NA, nrow=1, ncol=length(subsamples)+3)
+    cluster.locations[1,1] = 1
+    cluster.locations[1,2:(length(subsamples)+1)] = localOptima[1,]
+    cluster.locations[1,(length(subsamples)+2):ncol(cluster.locations)] = c(no.muts, no.muts)
+    mutation.preferences = data.frame(rep(1, no.muts))
+    #cluster.locations = as.data.frame(cluster.locations)
+    #colnames(cluster.locations) = c("cluster.no", paste(samplename, subsamples, sep=""), "estimated.no.of.mutations", "no.of.mutations.assigned")
     warning("No local optima found")
   }
   
   # Remove the estimated number of SNVs per cluster from the cluster locations table  
-  cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations))])
+  cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations)), drop=F])
   # Report only the clusters that have mutations assigned
   non_empty_clusters = cluster.locations[,ncol(cluster.locations)] > 0
   cluster.locations = cluster.locations[non_empty_clusters,,drop=F]
@@ -629,7 +635,7 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
   clust_order = order(rowSums(cluster.locations[,2:(ncol(cluster.locations)-1)]), decreasing=T)
   cluster.locations = cluster.locations[clust_order,,drop=F]
   cluster.locations[,1] = 1:nrow(cluster.locations)
-  mutation.preferences = mutation.preferences[,clust_order]
+  mutation.preferences = mutation.preferences[,clust_order, drop=F]
 
   # get most likely cluster
   most.likely.cluster = max.col(mutation.preferences)