Skip to content

Commit

Permalink
Merge pull request #2 from Wedge-Oxford/remove_small_clusters
Browse files Browse the repository at this point in the history
Removal of small clusters
  • Loading branch information
sdentro authored May 30, 2019
2 parents 0247402 + 733a590 commit 159653d
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 26 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Maintainer: Stefan Dentro <sd11@sanger.ac.uk>
License: GPL-3
Type: Package
Title: DPClust - An R package for subclonal reconstruction of tumours using sequencing data
Version: 2.2.6
Version: 2.2.7
Authors@R: c(person("David", "Wedge", role=c("aut"), email="dw9@sanger.ac.uk"),
person("Peter", "Van Loo", role=c("aut")),
person("Stefan","Dentro", email="sd11@sanger.ac.uk", role=c("aut", "cre")))
Expand Down
14 changes: 10 additions & 4 deletions R/AssignMutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -456,9 +456,8 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
localMins = localMins + hypercube.size
localOptima = array(rep(range[,1],each=no.subsamples),dim(localMins))
localOptima = localOptima + array(rep((range[,2] - range[,1])/(gridsize-1),each=no.subsamples),dim(localMins)) * localMins

write.table(cbind(localOptima,above95confidence),paste(new_output_folder,"/",samplename,"_localMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = c(paste(samplename,subsamples,sep=""),"above95percentConfidence"))
write.table(localOptima[above95confidence,],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))
write.table(localOptima[above95confidence,,drop=F],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))

no.optima = nrow(localOptima)

Expand Down Expand Up @@ -614,11 +613,18 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
write.table(CIs,paste(new_output_folder,"/",samplename,"_confInts_",density.smooth,".txt",sep=""),col.names = paste(rep(paste(samplename,subsamples,sep=""),each=2),rep(c(".lower.CI",".upper.CI"),no.subsamples),sep=""),row.names=F,sep="\t",quote=F)
}else{
most.likely.cluster = rep(1,no.muts)
cluster.locations = matrix(NA, nrow=1, ncol=length(subsamples)+3)
cluster.locations[1,1] = 1
cluster.locations[1,2:(length(subsamples)+1)] = localOptima[1,]
cluster.locations[1,(length(subsamples)+2):ncol(cluster.locations)] = c(no.muts, no.muts)
mutation.preferences = data.frame(rep(1, no.muts))
#cluster.locations = as.data.frame(cluster.locations)
#colnames(cluster.locations) = c("cluster.no", paste(samplename, subsamples, sep=""), "estimated.no.of.mutations", "no.of.mutations.assigned")
warning("No local optima found")
}

# Remove the estimated number of SNVs per cluster from the cluster locations table
cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations))])
cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations)), drop=F])
# Report only the clusters that have mutations assigned
non_empty_clusters = cluster.locations[,ncol(cluster.locations)] > 0
cluster.locations = cluster.locations[non_empty_clusters,,drop=F]
Expand All @@ -629,7 +635,7 @@ multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment
clust_order = order(rowSums(cluster.locations[,2:(ncol(cluster.locations)-1)]), decreasing=T)
cluster.locations = cluster.locations[clust_order,,drop=F]
cluster.locations[,1] = 1:nrow(cluster.locations)
mutation.preferences = mutation.preferences[,clust_order]
mutation.preferences = mutation.preferences[,clust_order, drop=F]

# get most likely cluster
most.likely.cluster = max.col(mutation.preferences)
Expand Down
116 changes: 106 additions & 10 deletions R/DirichletProcessClustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#' @param no.iters.burn.in The number of iterations that should be discarded as burn in of the MCMC chain
#' @param mut.assignment.type Mutation assignment type option
#' @param num_muts_sample The number of mutations from which to start downsampling
#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output (Default: NULL)
#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final output (Default: 0.01)
#' @param species Species (Default: Human)
#' @param is.male Boolean set to TRUE when the donor is male, female otherwise
#' @param assign_sampled_muts A boolean whether to assign mutations that have not been used for clustering due to downsampling (Default: TRUE)
#' @param supported_chroms Vector with chromosome names from which mutations can be used (Default: NULL)
Expand All @@ -15,19 +18,30 @@
#' @return A list containing these components
#' @author sd11
#' @export
make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) {
make_run_params = function(no.iters, no.iters.burn.in, mut.assignment.type, num_muts_sample, is.male, min_muts_cluster=NULL, min_frac_muts_cluster=0.01, species="human", assign_sampled_muts=TRUE, supported_chroms=NULL, keep_temp_files=TRUE, generate_cluster_ordering=FALSE) {
if (is.null(supported_chroms)) {
# Set the expected chromosomes based on the sex
if (is.male) {
supported_chroms = as.character(c(1:22, "X", "Y"))
if (species=="human" | species=="Human") {
# Set the expected chromosomes based on the sex
if (is.male) {
supported_chroms = as.character(c(1:22, "X", "Y"))
} else {
supported_chroms = as.character(c(1:22, "X"))
}
} else {
supported_chroms = as.character(c(1:22, "X"))
if (species=="mouse" | species=="Mouse") {
# Set the expected chromosomes based on the sex
if (is.male) {
supported_chroms = as.character(c(1:19, "X", "Y"))
} else {
supported_chroms = as.character(c(1:19, "X"))
}
}
}
}

return(list(no.iters=no.iters, no.iters.burn.in=no.iters.burn.in, mut.assignment.type=mut.assignment.type,
supported_chroms=supported_chroms, num_muts_sample=num_muts_sample, assign_sampled_muts=assign_sampled_muts, keep_temp_files=keep_temp_files,
generate_cluster_ordering=generate_cluster_ordering))
generate_cluster_ordering=generate_cluster_ordering, species=species, min_muts_cluster=min_muts_cluster, min_frac_muts_cluster=min_frac_muts_cluster))
}

#' Helper function to package sample parameters
Expand Down Expand Up @@ -348,21 +362,33 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
} else {
density = NA
}

polygon_file = file.path(outdir, paste(samplename, "_DirichletProcessplotpolygonData.txt", sep=""))
if (file.exists(polygon_file)) {
polygon.data = read.table(polygon_file, header=T)
} else {
polygon.data = NA
}

load(file.path(outdir, paste(samplename, "_gsdata.RData", sep="")))
write_tree = analysis_type != 'nd_dp' & analysis_type != 'reassign_muts_1d' & analysis_type != 'reassign_muts_nd'
writeStandardFinalOutput(clustering=clustering,
dataset=dataset,
most.similar.mut=most.similar.mut,
outfiles.prefix=outfiles.prefix,
outdir=outdir,
samplename=samplename,
subsamplenames=subsamples,
GS.data=GS.data,
density=density,
polygon.data=polygon.data,
no.iters=no.iters,
no.iters.burn.in=no.iters.burn.in,
assign_sampled_muts=assign_sampled_muts,
write_tree=write_tree,
generate_cluster_ordering=generate_cluster_ordering)
generate_cluster_ordering=generate_cluster_ordering,
min_muts_cluster=min_muts_cluster,
min_frac_muts_cluster=min_frac_muts_cluster)
}

####################################################################################################################
Expand Down Expand Up @@ -392,9 +418,13 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
.remove_file(file.path(outdir, paste(samplename, "_localMultidimensionalOptima_0.01.txt", sep="")))
.remove_file(file.path(outdir, paste(samplename, "_optimaInfo_0.01.txt", sep="")))

for (i in 1:(length(subsamplesrun)-1)) {
for (j in (i+1):length(subsamplesrun)) {
for (i in 1:(length(subsamples)-1)) {
for (j in (i+1):length(subsamples)) {
.remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.RData", sep="")))
.remove_file(file.path(outdir, paste(samplename, subsamples[i], subsamples[j], "_densityoutput.csv", sep="")))
.remove_file(file.path(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*densityData1.csv", sep="")), full.names=T))
density_csv_files = list.files(outdir, pattern=glob2rx(paste(samplename, subsamples[i], subsamples[j], "*vals.csv", sep="")), full.names=T)
for (infile in density_csv_files) { .remove_file(infile) }
}
}

Expand All @@ -409,20 +439,86 @@ RunDP <- function(analysis_type, run_params, sample_params, advanced_params, out
#' @param dataset The dataset that went into clustering
#' @param most.similar.mut Vector containing for each non-sampled mutation its most similar sampled mutation. The non-sampled mutation will be assigned to the same cluster
#' @param outfiles.prefix A prefix for the filenames
#' @param outdir Output directory where the replot of the 1D method is to be stored if clusters are removed due to being too small
#' @param samplename Overall samplename
#' @param subsamplenames Samplenames of the different timepoints
#' @param GS.data MCMC output
#' @param density Posterior density estimate across MCMC iterations
#' @param polygon.data 1D confidence interval, used for plotting the 1D method (set to NA for multi-D method)
#' @param no.iters Number of total iterations
#' @param no.iters.burn.in Number of iterations to be used as burn-in
#' @param min_muts_cluster The minimum number of mutations required for a cluster to be kept in the final output
#' @param min_frac_muts_cluster The minimum fraction of mutations required for a cluster to be kept in the final
#' @param assign_sampled_muts Boolean whether to assign the non-sampled mutations (Default: TRUE)
#' @param write_tree Boolean whether to write a tree to file. Not all clustering methods return a tree (Default: FALSE)
#' @param generate_cluster_ordering Boolean specifying whether a possible cluster ordering should be determined (Default: FALSE)
#' @param no.samples.cluster.order Number of mutations to sample (with replacement) to classify pairs of clusters into parent-offspring or siblings (Default: 1000)
#' @author sd11
writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, samplename, subsamplenames, GS.data, density, no.iters, no.iters.burn.in, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
writeStandardFinalOutput = function(clustering, dataset, most.similar.mut, outfiles.prefix, outdir, samplename, subsamplenames, GS.data, density, polygon.data, no.iters, no.iters.burn.in, min_muts_cluster, min_frac_muts_cluster, assign_sampled_muts=T, write_tree=F, generate_cluster_ordering=F, no.samples.cluster.order=1000) {
num_samples = ncol(dataset$mutCount)

########################################################################
# Check for too small clusters
########################################################################
if (nrow(clustering$cluster.locations) > 1 & (min_muts_cluster!=-1 | min_frac_muts_cluster!=-1)) {
if (min_muts_cluster!=-1 & min_frac_muts_cluster!=-1) print("Found entries for both min_muts_cluster and min_frac_muts_cluster, used which yielded the largest number")

# min_muts_cluster = ifelse(is.null(min_muts_cluster), -1, min_muts_cluster)
min_frac_muts_cluster = ifelse(min_frac_muts_cluster==-1, -1, min_frac_muts_cluster*nrow(dataset$mutCount))

# use min_muts_cluster variable further down, overwrite with min_frac_muts_cluster
if (min_frac_muts_cluster > min_muts_cluster) {
min_muts_cluster = min_frac_muts_cluster
}

clusters_to_remove = clustering$cluster.locations[,num_samples+2] < min_muts_cluster

if (sum(clusters_to_remove) > 0 & sum(clusters_to_remove) < length(clusters_to_remove)) {
# remove clusters that are too small
clusterids_to_remove = clustering$cluster.locations[clusters_to_remove,1]
new_cluster.locations = clustering$cluster.locations[!clustering$cluster.locations[,1] %in% clusterids_to_remove,,drop=F]
new_all.assignment.likelihoods = clustering$all.assignment.likelihoods[,!clusters_to_remove, drop=F]
new_best.assignment.likelihoods = clustering$best.assignment.likelihoods
new_best.node.assignments = clustering$best.node.assignments
# reset best likelihoods and hard assignments for mutations assigned to the removed cluster(s)
new_best.assignment.likelihoods[new_best.node.assignments %in% clusterids_to_remove] = NA
new_best.node.assignments[new_best.node.assignments %in% clusterids_to_remove] = NA

clustering$cluster.locations = new_cluster.locations
clustering$all.assignment.likelihoods = new_all.assignment.likelihoods
clustering$best.node.assignments = new_best.node.assignments
clustering$best.assignment.likelihoods = new_best.assignment.likelihoods

# if 1D clustering, then replot without the removed cluster
if (ncol(dataset$mutCount)==1) {
# Old plot
plot1D(density=density,
polygon.data=polygon.data[,1],
pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations.png", sep=""),
density.from=0,
x.max=1.5,
mutationCopyNumber=dataset$mutation.copy.number,
no.chrs.bearing.mut=dataset$copyNumberAdjustment,
samplename=samplename,
cluster.locations=clustering$cluster.locations,
mutation.assignments=clustering$best.node.assignments)

# New plot
plot1D_2(density=density,
polygon.data=polygon.data[,1],
pngFile=paste(outdir, "/", samplename, "_DirichletProcessplot_with_cluster_locations_2.png", sep=""),
density.from=0,
x.max=1.5,
mutationCopyNumber=dataset$mutation.copy.number,
no.chrs.bearing.mut=dataset$copyNumberAdjustment,
samplename=samplename,
cluster.locations=clustering$cluster.locations,
mutation.assignments=clustering$best.node.assignments,
mutationTypes=dataset$mutationType)
}
}
}

if (generate_cluster_ordering) {
########################################################################
# Before doing anything else, calculate confidence intervals on the cluster locations using only the mutations used during clustering
Expand Down
2 changes: 1 addition & 1 deletion R/PlotDensities.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ plot1D = function(density, polygon.data, pngFile=NA, density.from=0, x.max=NA, y
if(!is.null(cluster.locations) & !is.null(mutation.assignments)) {
assign.counts = table(mutation.assignments)

for (cluster in unique(mutation.assignments)) {
for (cluster in unique(mutation.assignments[!is.na(mutation.assignments)])) {
x = cluster.locations[cluster.locations[,1]==cluster, 2]
lines(x=c(x, x), y=c(0, y.max), col="black", lwd=3)
text(paste("Cluster",cluster, sep=" "), x=x+0.01, y=(9/10)*y.max, adj=c(0,0), cex=2)
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ cd dpclust/inst/example

Run DPClust on provided example data. After checking out this repository, build the image:
```
docker build -t dpclust:2.2.6 .
docker build -t dpclust:2.2.7 .
```

Run DPClust as follows
```
docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.6 /opt/dpclust/example/run_docker.sh
docker run -it -v `pwd`:/mnt/output/ dpclust:2.2.7 /opt/dpclust/example/run_docker.sh
```

## Input description
Expand Down
Loading

0 comments on commit 159653d

Please sign in to comment.