From 7ecec055ae2b9470a9cabf0a62a957a66afb7e94 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 09:25:23 +0100 Subject: [PATCH 01/11] Fix issue when only a single density optimum with mutations is found --- R/AssignMutations.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/AssignMutations.R b/R/AssignMutations.R index 1b42617..9ed38d9 100755 --- a/R/AssignMutations.R +++ b/R/AssignMutations.R @@ -91,7 +91,7 @@ oneDimensionalClustering <- function(samplename, subclonal.fraction, GS.data, de clust_order = order(cluster_locations[,2], decreasing=T) cluster_locations = cluster_locations[clust_order,,drop=F] cluster_locations[,1] = 1:nrow(cluster_locations) - mutation.preferences = mutation.preferences[,clust_order] + mutation.preferences = mutation.preferences[,clust_order, drop=F] # get most likely cluster most.likely.cluster = max.col(mutation.preferences) From 0247402e399c1a282e9858af37780832b2f6abd9 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 09:47:27 +0100 Subject: [PATCH 02/11] Adding in making of 2D plots and removal of created density files, if requested to remove temp output --- R/DirichletProcessClustering.R | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index b6a338a..4f875b3 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -392,6 +392,12 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out .remove_file(file.path(outdir, paste(samplename, "_localMultidimensionalOptima_0.01.txt", sep=""))) .remove_file(file.path(outdir, paste(samplename, "_optimaInfo_0.01.txt", sep=""))) + for (i in 1:(length(subsamplesrun)-1)) { + for (j in (i+1):length(subsamplesrun)) { + .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep=""))) + } + } + nd_density_files = list.files(outdir, pattern="_2D_binomial_") if (length(nd_density_files) > 0) { file.remove(nd_density_files) } } @@ -697,24 +703,24 @@ DirichletProcessClustering <- function(mutCount, WTCount, totalCopyNumber, copyN ######################## # Plot density and Assign mutations to clusters - nD ######################## - print("Assigning mutations to clusters...") - # print("Estimating density between pairs of samples...") - # for (i in 1:(length(subsamplesrun)-1)) { - # for (j in (i+1):length(subsamplesrun)) { - # print(paste("Samples", subsamplesrun[i], "and", subsamplesrun[j], sep=" ")) - # imageFile = file.path(output_folder, paste(samplename,subsamplesrun[i],subsamplesrun[j],"_iters",no.iters,"_concParam",conc_param,"_clusterWidth",1/cluster_conc,"_2D_binomial.png",sep="")) - # density = Gibbs.subclone.density.est(mutation.copy.number[,c(i,j)]/copyNumberAdjustment[,c(i,j)], - # GS.data, - # imageFile, - # post.burn.in.start = no.iters.burn.in, - # post.burn.in.stop = no.iters, - # samplenames = paste(samplename,subsamplesrun[c(i,j)],sep=""), - # indices=c(i,j)) - # save(file=file.path(output_folder, paste(samplename, subsamplesrun[i], subsamplesrun[j], "_densityoutput.RData", sep="")), GS.data, density) - # } - # } + print("Estimating density between pairs of samples...") + for (i in 1:(length(subsamplesrun)-1)) { + for (j in (i+1):length(subsamplesrun)) { + print(paste("Samples", subsamplesrun[i], "and", subsamplesrun[j], sep=" ")) + imageFile = file.path(output_folder, paste(samplename,subsamplesrun[i],subsamplesrun[j],"_iters",no.iters,"_concParam",conc_param,"_clusterWidth",1/cluster_conc,"_2D_binomial.png",sep="")) + density = Gibbs.subclone.density.est(mutation.copy.number[,c(i,j)]/copyNumberAdjustment[,c(i,j)], + GS.data, + imageFile, + post.burn.in.start = no.iters.burn.in, + post.burn.in.stop = no.iters, + samplenames = paste(samplename,subsamplesrun[c(i,j)],sep=""), + indices=c(i,j)) + save(file=file.path(output_folder, paste(samplename, subsamplesrun[i], subsamplesrun[j], "_densityoutput.RData", sep="")), GS.data, density) + } + } # Assign mutations to clusters using one of the different assignment methods + print("Assigning mutations to clusters...") opts = list(samplename=samplename, subsamplenames=subsamplesrun, no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, no.iters.post.burn.in=no.iters-no.iters.burn.in, outdir=output_folder) if (mut.assignment.type == 1) { consClustering = multiDimensionalClustering(mutation.copy.number=mutation.copy.number, From d812df4a3d8a724df4a5c23471e2a2bed6dbd69f Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 11:27:29 +0100 Subject: [PATCH 03/11] Removal of small clusters code added --- R/DirichletProcessClustering.R | 73 +++++++++++++++++++++++++++++---- inst/example/dpclust_pipeline.R | 9 +++- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index 4f875b3..111e8ee 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -7,6 +7,9 @@ #' @param no.iters.burn.in The number of iterations that should be discarded as burn in of the MCMC chain #' @param mut.assignment.type Mutation assignment type option #' @param num_muts_sample The number of mutations from which to start downsampling +#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output (Default: NULL) +#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final output (Default: 0.01) +#' @param species Species (Default: Human) #' @param is.male Boolean set to TRUE when the donor is male, female otherwise #' @param assign_sampled_muts A boolean whether to assign mutations that have not been used for clustering due to downsampling (Default: TRUE) #' @param supported_chroms Vector with chromosome names from which mutations can be used (Default: NULL) @@ -15,19 +18,30 @@ #' @return A list containing these components #' @author sd11 #' @export -make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) { +make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, min_muts_cluster=NULL, min_frac_muts_cluster=0.01, species="human", assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) { if (is.null(supported_chroms)) { - # Set the expected chromosomes based on the sex - if (is.male) { - supported_chroms = as.character(c(1:22, "X", "Y")) + if (species=="human" | species=="Human") { + # Set the expected chromosomes based on the sex + if (is.male) { + supported_chroms = as.character(c(1:22, "X", "Y")) + } else { + supported_chroms = as.character(c(1:22, "X")) + } } else { - supported_chroms = as.character(c(1:22, "X")) + if (species=="mouse" | species=="Mouse") { + # Set the expected chromosomes based on the sex + if (is.male) { + supported_chroms = as.character(c(1:19, "X", "Y")) + } else { + supported_chroms = as.character(c(1:19, "X")) + } + } } } return(list(no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, mut.assignment.type=mut.assignment.type, supported_chroms=supported_chroms, num_muts_sample=num_muts_sample, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files, - generate_cluster_ordering=generate_cluster_ordering)) + generate_cluster_ordering=generate_cluster_ordering, species=species, min_muts_cluster=min_muts_cluster, min_frac_muts_cluster=min_frac_muts_cluster)) } #' Helper function to package sample parameters @@ -362,7 +376,9 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out no.iters.burn.in=no.iters.burn.in, assign_sampled_muts=assign_sampled_muts, write_tree=write_tree, - generate_cluster_ordering=generate_cluster_ordering) + generate_cluster_ordering=generate_cluster_ordering, + min_muts_cluster=min_muts_cluster, + min_frac_muts_cluster=min_frac_muts_cluster) } #################################################################################################################### @@ -415,14 +431,55 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out #' @param density Posterior density estimate across MCMC iterations #' @param no.iters Number of total iterations #' @param no.iters.burn.in Number of iterations to be used as burn-in +#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output +#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final #' @param assign_sampled_muts Boolean whether to assign the non-sampled mutations (Default: TRUE) #' @param write_tree Boolean whether to write a tree to file. Not all clustering methods return a tree (Default: FALSE) #' @param generate_cluster_ordering Boolean specifying whether a possible cluster ordering should be determined (Default: FALSE) #' @param no.samples.cluster.order Number of mutations to sample (with replacement) to classify pairs of clusters into parent-offspring or siblings (Default: 1000) #' @author sd11 -writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) { +writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) { num_samples = ncol(dataset$mutCount) + ######################################################################## + # Check for too small clusters + ######################################################################## + if (nrow(clustering$cluster.locations) > 1 & (!is.null(min_muts_cluster) | !is.null(min_frac_muts_cluster))) { + if (!is.null(min_muts_cluster) & !is.null(min_frac_muts_cluster)) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number") + + min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster) + min_frac_muts_cluster = ifelse(is.null(min_frac_muts_cluster), -1, min_frac_muts_cluster*nrow(dataset$mutCount)) + + # use min_muts_cluster variable further down, overwrite with min_frac_muts_cluster + if (min_frac_muts_cluster > min_muts_cluster) { + min_muts_cluster = min_frac_muts_cluster + } + + clusters_to_remove = clustering$cluster.locations[,num_samples+2] < min_muts_cluster + + if (sum(clusters_to_remove) > 0) { + # remove clusters that are too small + clusterids_to_remove = clustering$cluster.locations[clusters_to_remove,1] + new_cluster.locations = clustering$cluster.locations[!clustering$cluster.locations[,1] %in% clusterids_to_remove,,drop=F] + new_all.assignment.likelihoods = clustering$all.assignment.likelihoods[,!clusters_to_remove, drop=F] + new_best.assignment.likelihoods = clustering$best.assignment.likelihoods + new_best.node.assignments = clustering$best.node.assignments + # reset best likelihoods and hard assignments for mutations assigned to the removed cluster(s) + new_best.assignment.likelihoods[new_best.node.assignments %in% clusterids_to_remove] = NA + new_best.node.assignments[new_best.node.assignments %in% clusterids_to_remove] = NA + + clustering$cluster.locations = new_cluster.locations + clustering$all.assignment.likelihoods = new_all.assignment.likelihoods + clustering$best.node.assignments = new_best.node.assignments + clustering$best.assignment.likelihoods = new_best.node.assignments + + # if 1D clustering, then replot without the removed cluster + if (ncol(dataset$mutCount)==1) { + # todo + } + } + } + if (generate_cluster_ordering) { ######################################################################## # Before doing anything else, calculate confidence intervals on the cluster locations using only the mutations used during clustering diff --git a/inst/example/dpclust_pipeline.R b/inst/example/dpclust_pipeline.R index ad78fd1..668fc3a 100755 --- a/inst/example/dpclust_pipeline.R +++ b/inst/example/dpclust_pipeline.R @@ -19,6 +19,8 @@ option_list = list( make_option(c("--iterations"), type="integer", default=2000, help="Number of iterations to run the MCMC chain", metavar="character"), make_option(c("--burnin"), type="integer", default=1000, help="Number of iterations to discard as burnin", metavar="character"), make_option(c("--mut_assignment_type"), type="integer", default=1, help="Mutation assignment method", metavar="character"), + make_option(c("--min_muts_cluster"), type="integer", default=-1, help="Minimum number of mutations per cluster required for it to be kept in the final output, set to -1 to disable (default), see also --min_frac_muts_cluster", metavar="character"), + make_option(c("--min_frac_muts_cluster"), type="integer", default=0.01, help="Minimum fraction of mutations per cluster required for it to be kept in the final output, set to -1 to disable, see also --min_muts_cluster", metavar="character"), make_option(c("--num_muts_sample"), type="integer", default=50000, help="Number of mutations from which downsampling starts", metavar="character"), make_option(c("--bin_size"), type="double", default=NULL, help="Binsize to use when constructing multi-dimensional density - only used when number of samples > 1", metavar="character"), make_option(c("--seed"), type="integer", default=123, help="Provide a seed", metavar="character"), @@ -41,6 +43,8 @@ bin.size = opt$bin_size seed = opt$seed assign_sampled_muts = opt$assign_sampled_muts keep_temp_files = opt$keep_temp_files +min_muts_cluster = ifelse(opt$min_muts_cluster==-1, NULL, opt$min_muts_cluster) # set absolute minimum number of mutations per cluster +min_frac_muts_cluster = ifelse(opt$min_frac_muts_cluster==-1, NULL, opt$min_frac_muts_cluster) # set proportional minimum number of mutations per cluster if (is.null(outdir)) { outdir = getwd() } @@ -54,6 +58,7 @@ is.male = T sample.snvs.only = T # Perform sampling on just the SNVs and not on CNAs remove.snvs = F # Clear all SNVs, to perform clustering on CNAs only - This needs a better solution generate_cluster_ordering = F +species = "human" # mouse also supported, just changes the chromosomes on which mutations are kept, has not effect on functionality # Cocluster CNA parameters co_cluster_cna = F @@ -104,12 +109,12 @@ print("") ##################################################################################### # Create output path ##################################################################################### -outdir = paste(outdir, "/", samplename, "_DPoutput_", no.iters,"iters_", no.iters.burn.in, "burnin_seed", seed, "/", sep="") +outdir = file.path(outdir, paste(samplename, "_DPoutput_", no.iters,"iters_", no.iters.burn.in, "burnin_seed", seed, "/", sep="")) ##################################################################################### # Setup parameters ##################################################################################### -run_params = make_run_params(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male=is.male, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files, generate_cluster_ordering=generate_cluster_ordering) +run_params = make_run_params(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male=is.male, min_muts_cluster=min_muts_cluster, min_frac_muts_cluster=min_frac_muts_cluster, species=species, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files, generate_cluster_ordering=generate_cluster_ordering) sample_params = make_sample_params(datafiles, cellularity, is.male, samplename, subsamples, mutphasingfiles) advanced_params = make_advanced_params(seed) From 6efd33c3f6e7acf1ffa32c862614e355580b7555 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 12:16:01 +0100 Subject: [PATCH 04/11] Allow for replot of 1D figures after a cluster is removed --- R/DirichletProcessClustering.R | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index 111e8ee..d346d1c 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -362,16 +362,21 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out } else { density = NA } + polygon_file = file.path(outdir, paste(samplename, "_DirichletProcessplotpolygonData.txt", sep="")) + polygon.data = ifelse(file.exists(polygon_file), read.table(polygon_file, header=T), NA) + load(file.path(outdir, paste(samplename, "_gsdata.RData", sep=""))) write_tree = analysis_type != 'nd_dp' & analysis_type != 'reassign_muts_1d' & analysis_type != 'reassign_muts_nd' writeStandardFinalOutput(clustering=clustering, dataset=dataset, most.similar.mut=most.similar.mut, outfiles.prefix=outfiles.prefix, + outdir=outdir, samplename=samplename, subsamplenames=subsamples, GS.data=GS.data, density=density, + polygon.data=polygon.data, no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, assign_sampled_muts=assign_sampled_muts, @@ -425,10 +430,12 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out #' @param dataset The dataset that went into clustering #' @param most.similar.mut Vector containing for each non-sampled mutation its most similar sampled mutation. The non-sampled mutation will be assigned to the same cluster #' @param outfiles.prefix A prefix for the filenames +#' @param outdir Output directory where the replot of the 1D method is to be stored if clusters are removed due to being too small #' @param samplename Overall samplename #' @param subsamplenames Samplenames of the different timepoints #' @param GS.data MCMC output #' @param density Posterior density estimate across MCMC iterations +#' @param polygon.data 1D confidence interval, used for plotting the 1D method (set to NA for multi-D method) #' @param no.iters Number of total iterations #' @param no.iters.burn.in Number of iterations to be used as burn-in #' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output @@ -438,7 +445,7 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out #' @param generate_cluster_ordering Boolean specifying whether a possible cluster ordering should be determined (Default: FALSE) #' @param no.samples.cluster.order Number of mutations to sample (with replacement) to classify pairs of clusters into parent-offspring or siblings (Default: 1000) #' @author sd11 -writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) { +writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, outdir, samplename, subsamplenames, GS.data, density, polygon.data, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) { num_samples = ncol(dataset$mutCount) ######################################################################## @@ -471,11 +478,17 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi clustering$cluster.locations = new_cluster.locations clustering$all.assignment.likelihoods = new_all.assignment.likelihoods clustering$best.node.assignments = new_best.node.assignments - clustering$best.assignment.likelihoods = new_best.node.assignments + clustering$best.assignment.likelihoods = new_best.assignment.likelihoods # if 1D clustering, then replot without the removed cluster if (ncol(dataset$mutCount)==1) { - # todo + replot_1D(outdir=outdir, + outfiles.prefix=outfiles.prefix, + samplename=samplename, + dataset=dataset, + clustering=clustering, + density=density, + polygon.data=polygon.data) } } } From d2ec9c695e112200478459e323f5391a69989760 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 13:18:14 +0100 Subject: [PATCH 05/11] Change handling of default values of minimum cluster size --- R/DirichletProcessClustering.R | 2 +- inst/example/dpclust_pipeline.R | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index d346d1c..398cef4 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -452,7 +452,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi # Check for too small clusters ######################################################################## if (nrow(clustering$cluster.locations) > 1 & (!is.null(min_muts_cluster) | !is.null(min_frac_muts_cluster))) { - if (!is.null(min_muts_cluster) & !is.null(min_frac_muts_cluster)) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number") + if (min_muts_cluster!=-1 & min_frac_muts_cluster!=-1) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number") min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster) min_frac_muts_cluster = ifelse(is.null(min_frac_muts_cluster), -1, min_frac_muts_cluster*nrow(dataset$mutCount)) diff --git a/inst/example/dpclust_pipeline.R b/inst/example/dpclust_pipeline.R index 668fc3a..5efc613 100755 --- a/inst/example/dpclust_pipeline.R +++ b/inst/example/dpclust_pipeline.R @@ -20,7 +20,7 @@ option_list = list( make_option(c("--burnin"), type="integer", default=1000, help="Number of iterations to discard as burnin", metavar="character"), make_option(c("--mut_assignment_type"), type="integer", default=1, help="Mutation assignment method", metavar="character"), make_option(c("--min_muts_cluster"), type="integer", default=-1, help="Minimum number of mutations per cluster required for it to be kept in the final output, set to -1 to disable (default), see also --min_frac_muts_cluster", metavar="character"), - make_option(c("--min_frac_muts_cluster"), type="integer", default=0.01, help="Minimum fraction of mutations per cluster required for it to be kept in the final output, set to -1 to disable, see also --min_muts_cluster", metavar="character"), + make_option(c("--min_frac_muts_cluster"), type="numeric", default=0.01, help="Minimum fraction of mutations per cluster required for it to be kept in the final output, set to -1 to disable, see also --min_muts_cluster", metavar="character"), make_option(c("--num_muts_sample"), type="integer", default=50000, help="Number of mutations from which downsampling starts", metavar="character"), make_option(c("--bin_size"), type="double", default=NULL, help="Binsize to use when constructing multi-dimensional density - only used when number of samples > 1", metavar="character"), make_option(c("--seed"), type="integer", default=123, help="Provide a seed", metavar="character"), @@ -43,8 +43,8 @@ bin.size = opt$bin_size seed = opt$seed assign_sampled_muts = opt$assign_sampled_muts keep_temp_files = opt$keep_temp_files -min_muts_cluster = ifelse(opt$min_muts_cluster==-1, NULL, opt$min_muts_cluster) # set absolute minimum number of mutations per cluster -min_frac_muts_cluster = ifelse(opt$min_frac_muts_cluster==-1, NULL, opt$min_frac_muts_cluster) # set proportional minimum number of mutations per cluster +min_muts_cluster = opt$min_muts_cluster # set absolute minimum number of mutations per cluster +min_frac_muts_cluster = opt$min_frac_muts_cluster # set proportional minimum number of mutations per cluster if (is.null(outdir)) { outdir = getwd() } @@ -82,7 +82,7 @@ subsamples = sample2purity[sample2purity$sample==samplename,]$subsample cellularity = sample2purity[sample2purity$sample==samplename,]$cellularity if ("sex" %in% colnames(sample2purity)) { - is.male = sample2purity[sample2purity$sample==samplename,]$sex=="male" + is.male = (sample2purity[sample2purity$sample==samplename,]$sex=="male")[1] cndatafiles = sample2purity[sample2purity$sample==samplename,]$cndatafile } else { is.male = T From 88ebc40f107dbe1639902d7260673a38dd90386c Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 13:41:55 +0100 Subject: [PATCH 06/11] Replace replot function call with direct call to plotting functions --- R/DirichletProcessClustering.R | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index 398cef4..0925d96 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -482,13 +482,30 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi # if 1D clustering, then replot without the removed cluster if (ncol(dataset$mutCount)==1) { - replot_1D(outdir=outdir, - outfiles.prefix=outfiles.prefix, - samplename=samplename, - dataset=dataset, - clustering=clustering, - density=density, - polygon.data=polygon.data) + # Old plot + plot1D(density=density, + polygon.data=polygon.data[,1], + pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), + density.from=0, + x.max=1.5, + mutationCopyNumber=dataset$mutation.copy.number, + no.chrs.bearing.mut=dataset$copyNumberAdjustment, + samplename=samplename, + cluster.locations=clustering$cluster.locations, + mutation.assignments=clustering$best.node.assignments) + + # New plot + plot1D_2(density=density, + polygon.data=polygon.data[,1], + pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), + density.from=0, + x.max=1.5, + mutationCopyNumber=dataset$mutation.copy.number, + no.chrs.bearing.mut=dataset$copyNumberAdjustment, + samplename=samplename, + cluster.locations=clustering$cluster.locations, + mutation.assignments=clustering$best.node.assignments, + mutationTypes=dataset$mutationType) } } } From 1962273be4bbfd32397e360fa936e9f28144b9df Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 15:25:56 +0100 Subject: [PATCH 07/11] Removing NAs when plotting cluster locations and cleanup of min_muts_cluster parameters --- R/DirichletProcessClustering.R | 17 +++++++++++------ R/PlotDensities.R | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index 0925d96..839b1ce 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -362,8 +362,13 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out } else { density = NA } + polygon_file = file.path(outdir, paste(samplename, "_DirichletProcessplotpolygonData.txt", sep="")) - polygon.data = ifelse(file.exists(polygon_file), read.table(polygon_file, header=T), NA) + if (file.exists(polygon_file)) { + polygon.data = read.table(polygon_file, header=T) + } else { + polygon.data = NA + } load(file.path(outdir, paste(samplename, "_gsdata.RData", sep=""))) write_tree = analysis_type != 'nd_dp' & analysis_type != 'reassign_muts_1d' & analysis_type != 'reassign_muts_nd' @@ -451,11 +456,11 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi ######################################################################## # Check for too small clusters ######################################################################## - if (nrow(clustering$cluster.locations) > 1 & (!is.null(min_muts_cluster) | !is.null(min_frac_muts_cluster))) { + if (nrow(clustering$cluster.locations) > 1 & (min_muts_cluster!=-1 | min_frac_muts_cluster!=-1)) { if (min_muts_cluster!=-1 & min_frac_muts_cluster!=-1) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number") - min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster) - min_frac_muts_cluster = ifelse(is.null(min_frac_muts_cluster), -1, min_frac_muts_cluster*nrow(dataset$mutCount)) + # min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster) + min_frac_muts_cluster = ifelse(min_frac_muts_cluster==-1, -1, min_frac_muts_cluster*nrow(dataset$mutCount)) # use min_muts_cluster variable further down, overwrite with min_frac_muts_cluster if (min_frac_muts_cluster > min_muts_cluster) { @@ -483,7 +488,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi # if 1D clustering, then replot without the removed cluster if (ncol(dataset$mutCount)==1) { # Old plot - plot1D(density=density, + DPClust:::plot1D(density=density, polygon.data=polygon.data[,1], pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), density.from=0, @@ -495,7 +500,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi mutation.assignments=clustering$best.node.assignments) # New plot - plot1D_2(density=density, + DPClust:::plot1D_2(density=density, polygon.data=polygon.data[,1], pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), density.from=0, diff --git a/R/PlotDensities.R b/R/PlotDensities.R index 339c5b9..c6c9b2e 100755 --- a/R/PlotDensities.R +++ b/R/PlotDensities.R @@ -64,7 +64,7 @@ plot1D = function(density, polygon.data, pngFile=NA, density.from=0, x.max=NA, y if(!is.null(cluster.locations) & !is.null(mutation.assignments)) { assign.counts = table(mutation.assignments) - for (cluster in unique(mutation.assignments)) { + for (cluster in unique(mutation.assignments[!is.na(mutation.assignments)])) { x = cluster.locations[cluster.locations[,1]==cluster, 2] lines(x=c(x, x), y=c(0, y.max), col="black", lwd=3) text(paste("Cluster",cluster, sep=" "), x=x+0.01, y=(9/10)*y.max, adj=c(0,0), cex=2) From dc446bab1f735ade67cc782898857a26889f12b1 Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Tue, 28 May 2019 17:32:28 +0100 Subject: [PATCH 08/11] Catch case of wrong variable being called and add removal of txt files --- R/DirichletProcessClustering.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index 839b1ce..c93dd12 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -418,9 +418,12 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out .remove_file(file.path(outdir, paste(samplename, "_localMultidimensionalOptima_0.01.txt", sep=""))) .remove_file(file.path(outdir, paste(samplename, "_optimaInfo_0.01.txt", sep=""))) - for (i in 1:(length(subsamplesrun)-1)) { - for (j in (i+1):length(subsamplesrun)) { + for (i in 1:(length(subsamples)-1)) { + for (j in (i+1):length(subsamples)) { .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep=""))) + .remove_file(file.path(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep=""))) + density_csv_files = list.files(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep=""), full.names=T) + for (infile in density_csv_files) { .remove_file(infile) } } } @@ -488,7 +491,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi # if 1D clustering, then replot without the removed cluster if (ncol(dataset$mutCount)==1) { # Old plot - DPClust:::plot1D(density=density, + plot1D(density=density, polygon.data=polygon.data[,1], pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""), density.from=0, @@ -500,7 +503,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi mutation.assignments=clustering$best.node.assignments) # New plot - DPClust:::plot1D_2(density=density, + plot1D_2(density=density, polygon.data=polygon.data[,1], pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""), density.from=0, From cfc7753badfc74caa0c072d9bfd65067e1af80ac Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 29 May 2019 10:57:46 +0100 Subject: [PATCH 09/11] Fix removal of files --- R/DirichletProcessClustering.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/DirichletProcessClustering.R b/R/DirichletProcessClustering.R index c93dd12..ad2098a 100755 --- a/R/DirichletProcessClustering.R +++ b/R/DirichletProcessClustering.R @@ -421,8 +421,9 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out for (i in 1:(length(subsamples)-1)) { for (j in (i+1):length(subsamples)) { .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep=""))) - .remove_file(file.path(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep=""))) - density_csv_files = list.files(outdir, pattern=paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep=""), full.names=T) + .remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.csv", sep=""))) + .remove_file(file.path(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep="")), full.names=T)) + density_csv_files = list.files(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep="")), full.names=T) for (infile in density_csv_files) { .remove_file(infile) } } } @@ -472,7 +473,7 @@ writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfi clusters_to_remove = clustering$cluster.locations[,num_samples+2] < min_muts_cluster - if (sum(clusters_to_remove) > 0) { + if (sum(clusters_to_remove) > 0 & sum(clusters_to_remove) < length(clusters_to_remove)) { # remove clusters that are too small clusterids_to_remove = clustering$cluster.locations[clusters_to_remove,1] new_cluster.locations = clustering$cluster.locations[!clustering$cluster.locations[,1] %in% clusterids_to_remove,,drop=F] From c35fc777451b73e102cb4cc1d3831311a62a16ce Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Wed, 29 May 2019 11:52:15 +0100 Subject: [PATCH 10/11] Update documentation and version number --- DESCRIPTION | 2 +- README.md | 4 ++-- man/make_run_params.Rd | 13 ++++++++++--- man/writeStandardFinalOutput.Rd | 13 +++++++++++-- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5973f82..f9230f6 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Maintainer: Stefan Dentro License: GPL-3 Type: Package Title: DPClust - An R package for subclonal reconstruction of tumours using sequencing data -Version: 2.2.6 +Version: 2.2.7 Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"), person("Peter", "Van Loo", role=c("aut")), person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre"))) diff --git a/README.md b/README.md index 4bbe900..b2af712 100644 --- a/README.md +++ b/README.md @@ -35,12 +35,12 @@ cd dpclust/inst/example Run DPClust on provided example data. After checking out this repository, build the image: ``` -docker build -t dpclust:2.2.6 . +docker build -t dpclust:2.2.7 . ``` Run DPClust as follows ``` -docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.6 /opt/dpclust/example/run_docker.sh +docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.7 /opt/dpclust/example/run_docker.sh ``` ## Input description diff --git a/man/make_run_params.Rd b/man/make_run_params.Rd index 35b6303..b291a35 100755 --- a/man/make_run_params.Rd +++ b/man/make_run_params.Rd @@ -5,9 +5,10 @@ \title{Helper function to package run parameters} \usage{ make_run_params(no.iters, no.iters.burn.in, mut.assignment.type, - num_muts_sample, is.male, assign_sampled_muts = TRUE, - supported_chroms = NULL, keep_temp_files = TRUE, - generate_cluster_ordering = FALSE) + num_muts_sample, is.male, min_muts_cluster = NULL, + min_frac_muts_cluster = 0.01, species = "human", + assign_sampled_muts = TRUE, supported_chroms = NULL, + keep_temp_files = TRUE, generate_cluster_ordering = FALSE) } \arguments{ \item{no.iters}{The number of iterations that the MCMC chain should be run for} @@ -20,6 +21,12 @@ make_run_params(no.iters, no.iters.burn.in, mut.assignment.type, \item{is.male}{Boolean set to TRUE when the donor is male, female otherwise} +\item{min_muts_cluster}{The minimum number of mutations required for a cluster to be kept in the final output (Default: NULL)} + +\item{min_frac_muts_cluster}{The minimum fraction of mutations required for a cluster to be kept in the final output (Default: 0.01)} + +\item{species}{Species (Default: Human)} + \item{assign_sampled_muts}{A boolean whether to assign mutations that have not been used for clustering due to downsampling (Default: TRUE)} \item{supported_chroms}{Vector with chromosome names from which mutations can be used (Default: NULL)} diff --git a/man/writeStandardFinalOutput.Rd b/man/writeStandardFinalOutput.Rd index 665f807..bde2211 100755 --- a/man/writeStandardFinalOutput.Rd +++ b/man/writeStandardFinalOutput.Rd @@ -5,8 +5,9 @@ \title{Function that stores the final output in a unified format on disk} \usage{ writeStandardFinalOutput(clustering, dataset, most.similar.mut, - outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, - no.iters.burn.in, assign_sampled_muts = T, write_tree = F, + outfiles.prefix, outdir, samplename, subsamplenames, GS.data, density, + polygon.data, no.iters, no.iters.burn.in, min_muts_cluster, + min_frac_muts_cluster, assign_sampled_muts = T, write_tree = F, generate_cluster_ordering = F, no.samples.cluster.order = 1000) } \arguments{ @@ -18,6 +19,8 @@ writeStandardFinalOutput(clustering, dataset, most.similar.mut, \item{outfiles.prefix}{A prefix for the filenames} +\item{outdir}{Output directory where the replot of the 1D method is to be stored if clusters are removed due to being too small} + \item{samplename}{Overall samplename} \item{subsamplenames}{Samplenames of the different timepoints} @@ -26,10 +29,16 @@ writeStandardFinalOutput(clustering, dataset, most.similar.mut, \item{density}{Posterior density estimate across MCMC iterations} +\item{polygon.data}{1D confidence interval, used for plotting the 1D method (set to NA for multi-D method)} + \item{no.iters}{Number of total iterations} \item{no.iters.burn.in}{Number of iterations to be used as burn-in} +\item{min_muts_cluster}{The minimum number of mutations required for a cluster to be kept in the final output} + +\item{min_frac_muts_cluster}{The minimum fraction of mutations required for a cluster to be kept in the final} + \item{assign_sampled_muts}{Boolean whether to assign the non-sampled mutations (Default: TRUE)} \item{write_tree}{Boolean whether to write a tree to file. Not all clustering methods return a tree (Default: FALSE)} From f6b12e88247d4a1871962bab52797e06a32c26cf Mon Sep 17 00:00:00 2001 From: Stefan Dentro Date: Thu, 30 May 2019 21:25:34 +0100 Subject: [PATCH 11/11] Catch case of single optimum in nd clustering --- R/AssignMutations.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/AssignMutations.R b/R/AssignMutations.R index 9ed38d9..06b1014 100755 --- a/R/AssignMutations.R +++ b/R/AssignMutations.R @@ -456,9 +456,8 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment localMins = localMins + hypercube.size localOptima = array(rep(range[,1],each=no.subsamples),dim(localMins)) localOptima = localOptima + array(rep((range[,2] - range[,1])/(gridsize-1),each=no.subsamples),dim(localMins)) * localMins - write.table(cbind(localOptima,above95confidence),paste(new_output_folder,"/",samplename,"_localMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = c(paste(samplename,subsamples,sep=""),"above95percentConfidence")) - write.table(localOptima[above95confidence,],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep="")) + write.table(localOptima[above95confidence,,drop=F],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep="")) no.optima = nrow(localOptima) @@ -614,11 +613,18 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment write.table(CIs,paste(new_output_folder,"/",samplename,"_confInts_",density.smooth,".txt",sep=""),col.names = paste(rep(paste(samplename,subsamples,sep=""),each=2),rep(c(".lower.CI",".upper.CI"),no.subsamples),sep=""),row.names=F,sep="\t",quote=F) }else{ most.likely.cluster = rep(1,no.muts) + cluster.locations = matrix(NA, nrow=1, ncol=length(subsamples)+3) + cluster.locations[1,1] = 1 + cluster.locations[1,2:(length(subsamples)+1)] = localOptima[1,] + cluster.locations[1,(length(subsamples)+2):ncol(cluster.locations)] = c(no.muts, no.muts) + mutation.preferences = data.frame(rep(1, no.muts)) + #cluster.locations = as.data.frame(cluster.locations) + #colnames(cluster.locations) = c("cluster.no", paste(samplename, subsamples, sep=""), "estimated.no.of.mutations", "no.of.mutations.assigned") warning("No local optima found") } # Remove the estimated number of SNVs per cluster from the cluster locations table - cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations))]) + cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations)), drop=F]) # Report only the clusters that have mutations assigned non_empty_clusters = cluster.locations[,ncol(cluster.locations)] > 0 cluster.locations = cluster.locations[non_empty_clusters,,drop=F] @@ -629,7 +635,7 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment clust_order = order(rowSums(cluster.locations[,2:(ncol(cluster.locations)-1)]), decreasing=T) cluster.locations = cluster.locations[clust_order,,drop=F] cluster.locations[,1] = 1:nrow(cluster.locations) - mutation.preferences = mutation.preferences[,clust_order] + mutation.preferences = mutation.preferences[,clust_order, drop=F] # get most likely cluster most.likely.cluster = max.col(mutation.preferences)