Merge branch 'dev version 1.0.6'

Wedge-lab · Feb 22, 2017 · 30eb359 · 30eb359
2 parents 515a492 + b7efb24
commit 30eb359
Show file tree

Hide file tree

Showing 30 changed files with 2,351 additions and 602 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,7 @@
 Package: dpclust3p
 Type: Package
 Title: Dirichlet Process Clustering Pre-Processing Package
-Version: 1.0.5
-Date: 2016-08-03
+Version: 1.0.6
 Author: Stefan Dentro
 Maintainer: Stefan Dentro <sd11@sanger.ac.uk>
 License: GPL-3

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,7 +5,6 @@ export(ascatNgsToBattenberg)
 export(ascatToBattenberg)
 export(collate_bb_subclones)
 export(concat_files)
-export(createHistFacetPlot)
 export(createPng)
 export(dpIn2vcf)
 export(dumpCounts.Broad)
@@ -19,13 +18,15 @@ export(filterForDeaminase)
 export(filterForSignature)
 export(getTrinucleotideContext)
 export(identifyKataegis)
-export(meltFacetPlotData)
 export(mut_cn_phasing)
 export(mut_mut_phasing)
 export(mutationBurdenToMutationCopyNumber)
 export(mutationCopyNumberToMutationBurden)
 export(parseFai)
 export(parseIgnore)
+export(plot_ccf_hist)
+export(plot_mcn_hist)
+export(plot_vaf_hist)
 export(runGetDirichletProcessInfo)
 export(split_by_chrom)
 export(vcf2loci)

diff --git a/R/LoadData.R b/R/LoadData.R
diff --git a/R/allelecount.R b/R/allelecount.R
diff --git a/R/copynumber.R b/R/copynumber.R
@@ -45,6 +45,11 @@ collate_bb_subclones = function(samplename, bb_subclones_file, gender, outfile)
     cndata = cndata[!cndata$chr=="X",]
   }
 
+  # Remove all segments witout a call
+  if (any(is.na(cndata$nMin1_A) | is.na(cndata$nMaj1_A))) {
+    cndata = cndata[!(is.na(cndata$nMin1_A) | is.na(cndata$nMaj1_A)),]
+  }
+
   # Obtain the total copy number for each segment
   totalcn = rep(0, dim(cndata)[1])
   for (k in 1:nrow(cndata)) {

diff --git a/R/interconvertMutationBurdens.R b/R/interconvertMutationBurdens.R
diff --git a/R/preprocessing.R b/R/preprocessing.R
diff --git a/R/qualitycontrol.R b/R/qualitycontrol.R
@@ -1,110 +1,77 @@
 ##############################################
 # QC
 ##############################################
+#' Create a PNG file
+#' @param p The figure
+#' @param filename Where to save the image
+#' @param width Width
+#' @param height Height
 #' @export
 createPng <- function(p, filename, width, height) {
   png(filename=filename, width=width, height=height)
   print(p)
   dev.off()
 }
 
-#' @export
+#' Convenience function used for creating figures
+#' @noRd
 meltFacetPlotData <- function(data, subsamplenames) {
   d = as.data.frame(data)
   colnames(d) = subsamplenames
   data.m = reshape2::melt(d)
   return(data.m)
 }
 
-#' @export
+#' Template to create histograms
+#' @noRd
 createHistFacetPlot <- function(data, title, xlab, ylab, binwidth) {
   p = ggplot2::ggplot(data) + ggplot2::aes(x=value, y=..count..) + ggplot2::geom_histogram(binwidth=binwidth, colour="black", fill="gray") + ggplot2::facet_grid(variable ~ .)
   p = p + ggplot2::theme_bw(base_size=35) + ggplot2::ggtitle(title) + ggplot2::xlab(xlab) + ggplot2::ylab(ylab)
   return(p)
 }
 
+#' Template to create box plots
+#' @noRd
 createBoxFacetPlot <- function(data, title, xlab, ylab) {
   p = ggplot2::ggplot(data) + ggplot2::aes(x=variable, y=value) + ggplot2::geom_boxplot() + ggplot2::facet_grid(subsample ~ .)
   p = p + ggplot2::theme_bw() + ggplot2::ggtitle(title) + ggplot2::xlab(xlab) + ggplot2::ylab(ylab)
   return(p)
 }
 
-createQCDocument <- function(res, samplename, subsamplenames, outpath, cellularity) {
-  p = createHistFacetPlot(meltFacetPlotData(res$totalCopyNumber, subsamplenames), paste(samplename, "totalCopyNumber"), "Copynumber", "Count", binwidth=1)
-  createPng(p, paste(outpath, samplename, "_totalCopyNumber.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  p = createHistFacetPlot(meltFacetPlotData(res$mutation.copy.number, subsamplenames), paste(samplename, "mutation.copy.number"), "mutation.copy.number", "Count", binwidth=0.1)
+#' Plot mutation copy number histogram
+#' 
+#' @param data A DPClust input table
+#' @param samplename Name of the sample
+#' @param outdir Directory where the figure is to be stored
+#' @author sd11
+#' @export
+plot_mcn_hist = function(data, samplename, outdir) {
+  p = createHistFacetPlot(meltFacetPlotData(data$mutation.copy.number, samplename), paste(samplename, "mutation.copy.number"), "mutation.copy.number", "Count", binwidth=0.1)
   p = p + ggplot2::xlim(0,5)
-  createPng(p, paste(outpath, samplename, "_mutation.copy.number.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  p = createHistFacetPlot(meltFacetPlotData(res$copyNumberAdjustment, subsamplenames), paste(samplename, "copyNumberAdjustment"), "copyNumberAdjustment", "Count (log10)", binwidth=1)
-  p = p + ggplot2::scale_y_log10()
-  createPng(p, paste(outpath, samplename, "_copyNumberAdjustment.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  p = createHistFacetPlot(meltFacetPlotData(res$mutCount/(res$mutCount+res$WTCount), subsamplenames), paste(samplename, "alleleFrequency"), "Allele Frequency", "Count", binwidth=0.01)
-  createPng(p, paste(outpath, samplename, "_alleleFrequency.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  p = createHistFacetPlot(meltFacetPlotData(res$kappa, subsamplenames), paste(samplename, "kappa"), "Kappa", "Count", binwidth=0.01)
-  createPng(p, paste(outpath, samplename, "_kappa.png", sep=""), width=1500, height=500*length(subsamplenames))
-  #   p = createHistFacetPlot(meltFacetPlotData(res$subclonal.fraction, subsamplenames), paste(samplename, "subclonal.fraction"), "Subclonal fraction", "Count", binwidth=0.01)
-  #   createPng(p, paste(outpath, samplename, "_subclonal.fraction.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  p = createHistFacetPlot(meltFacetPlotData((res$mutCount+res$WTCount), subsamplenames), paste(samplename, "depth"), "Depth", "Count", binwidth=5)
-  createPng(p, paste(outpath, samplename, "_depth.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  #   fractionOfCells = res$mutation.copy.number / res$copyNumberAdjustment
-  #   meltFacetPlotData(fractionOfCells, subsamplenames)
-  p = createHistFacetPlot(meltFacetPlotData(res$subclonal.fraction, subsamplenames), paste(samplename, "Fraction Of Cells"), "Fraction of Cells", "Count", binwidth=0.05)
-  p = p + ggplot2::geom_vline(xintercept=0.5, colour="red", linetype="longdash", size=2) + ggplot2::geom_vline(xintercept=1.5, colour="red", linetype="longdash", size=2) + ggplot2::xlim(0,3) #scale_y_log10()
-  createPng(p, paste(outpath, samplename, "_fractionOfCells.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  #   manualMutCopyNum = mutationBurdenToMutationCopyNumber(res$mutCount/(res$mutCount+res$WTCount),res$totalCopyNumber ,cellularity)
-  #   p = createHistFacetPlot(meltFacetPlotData(manualMutCopyNum, subsamplenames), paste(samplename, "manualMutCopyNum"), "Mutation copy number", "Count", binwidth=0.1)
-  #   createPng(p, paste(outpath, samplename, "_manualMutCopyNum.png", sep=""), width=500, height=250*length(subsamplenames))
-  #   
-  #   manualFractionOfCells = mutationBurdenToMutationCopyNumber(res$mutCount/(res$mutCount+res$WTCount),res$totalCopyNumber ,cellularity) / res$copyNumberAdjustment
-  #   p = createHistFacetPlot(meltFacetPlotData(manualFractionOfCells, subsamplenames), paste(samplename, "manualFractionOfCells"), "Fraction of Cells", "Count", binwidth=0.01)
-  #   createPng(p, paste(outpath, samplename, "_manualFractionOfCells.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  # Manually melt the data
-  d.m = data.frame()
-  for (i in 1:length(subsamplenames)) {
-    d.loc = data.frame(subsample=subsamplenames[i],variable=factor(dataset$chromosome[,i]), value=dataset$subclonal.fraction[,i])
-    d.m = rbind(d.m, d.loc)
-  }
-  p = createBoxFacetPlot(d.m, paste(samplename, "subclonal fraction per chrom"), "Chromosome", "Subclonal fraction")
-  createPng(p, paste(outpath, samplename, "_subclonalFractionPerChromosome.png", sep=""), width=1500, height=500*length(subsamplenames))
-
-  # Manually melt the data
-  d = as.data.frame(dataset$chromosome)
-  colnames(d) = subsamplenames
-  d.m = data.frame()
-  for (i in 1:ncol(d)) {
-    d.loc = d[dataset$subclonal.fraction > 1.5,i]
-    if(length(d.loc) > 0) { 
-      d.m = rbind(d.m, data.frame(variable=colnames(d)[i], value=factor(d.loc)))
-    }
-  }
-
-  if (nrow(d.m) > 0) {
-    p = createHistFacetPlot(d.m, paste(samplename, "subclonal fraction > 1.5"), "Chromosome", "Count", binwidth=1)
-    createPng(p, paste(outpath, samplename, "_large.subclonal.fraction.by.chrom.png", sep=""), width=1500, height=500*length(subsamplenames))
-  }
+  createPng(p, file.path(outpath, paste(samplename, "_mutation.copy.number.png", sep="")), width=1500, height=500)
 }
 
-#'
-#' Run the QC on specified data
-#'
-runQc = function(infile, datpath, outpath) {
-  sample2purity = read.table(infile, header=T, stringsAsFactors=F)
-  for (samplename in unique(sample2purity$sample)) {
-    print(samplename)
-    datafiles = sample2purity[sample2purity$sample==samplename,]$datafile
-    subsamples = sample2purity[sample2purity$sample==samplename,]$subsample
-    cellularity = sample2purity[sample2purity$sample==samplename,]$cellularity
-    if (file.exists(paste(datpath,datafiles, sep=""))) {
-      dataset = load.data(datpath,"",datafiles, cellularity=cellularity, Chromosome="chr", position="end", WT.count="WT.count", mut.count="mut.count", subclonal.CN="subclonal.CN", no.chrs.bearing.mut="no.chrs.bearing.mut", mutation.copy.number="mutation.copy.number", subclonal.fraction="subclonal.fraction", data_file_suffix="")
-      createQCDocument(dataset, samplename, subsamples, outpath, cellularity)
-    }
-  }
+#' Plot cancer cell fraction histogram
+#' 
+#' @param data A DPClust input table
+#' @param samplename Name of the sample
+#' @param outdir Directory where the figure is to be stored
+#' @author sd11
+#' @export
+plot_ccf_hist = function(data, samplename, outdir) {
+  p = createHistFacetPlot(meltFacetPlotData(data$subclonal.fraction, samplename), paste(samplename, "Fraction Tumour Cells"), "Fraction Tumour Cells", "Count", binwidth=0.05)
+  p = p + ggplot2::xlim(0,1.5)
+  createPng(p, file.path(outpath, paste(samplename, "_fractionOfCells.png", sep="")), width=1500, height=500)
+}
+
+#' Plot allele frequency histogram
+#' 
+#' @param data A DPClust input table
+#' @param samplename Name of the sample
+#' @param outdir Directory where the figure is to be stored
+#' @author sd11
+#' @export
+plot_vaf_hist = function(data, samplename, outdir) {
+  p = createHistFacetPlot(meltFacetPlotData(data$mutCount/(data$mutCount+data$WTCount), samplename), paste(samplename, "alleleFrequency"), "Allele Frequency", "Count", binwidth=0.01)
+  createPng(p, file.path(outpath, paste(samplename, "_alleleFrequency.png", sep="")), width=1500, height=500)
 }
diff --git a/R/util.R b/R/util.R
@@ -0,0 +1,72 @@
+#' Concatenate split files
+#' 
+#' Convenience function to concatenate a series of files specified in a file of file names.
+#' This function assumes all files have the same layout.
+#' @param fofn A file of file names to be concatenated
+#' @param inputdir Full path to where the input files are stored
+#' @param outfile Full path to where the output should be written
+#' @param haveHeader Boolean that specifies whether the input files have a header
+#' @author sd11
+#' @export
+concat_files = function(fofn, inputdir, outfile, haveHeader) {
+
+  inputdir = paste(inputdir,"/", sep="")
+  list_of_files = read.table(fofn, stringsAsFactors=F, header=F)[,1]
+
+  output = data.frame()
+  for (infile in list_of_files) {
+    infile = paste(inputdir, infile, sep="")
+    # Check if file is there and it contains data
+    if (file.exists(infile) & file.info(infile)$size != 0) {
+      dat = read.delim(infile, header=haveHeader, quote=NULL, stringsAsFactors=F)
+      output = rbind(output, dat)
+    }
+  }
+
+  write.table(output, file=outfile, col.names=haveHeader, row.names=F, sep="\t", quote=F)
+}
+
+#' Split a file per chromosome
+#'
+#' Convenience function to split an input file per chromosome. All it requires is that
+#' the infile has as first column chromosome specification. The output files will be named
+#' outdir/prefixCHROMNUMBERpostfix
+#' @param infile The file to be split
+#' @param prefix Prefix of the output file
+#' @param postfix Postfix of the output file
+#' @param outdir Directory where the output files are to be written
+#' @param chrom_file A simple list of chromosomes to be considered
+#' @author sd11
+#' @export
+split_by_chrom = function(infile, prefix, postfix, outdir, chrom_file) {
+  outdir = paste(outdir, "/", sep="")
+
+  # Check if there are lines in the file, otherwise it will crash this script
+  if (file.info(infile)$size == 0) {
+    print("No lines in loci file")
+    q(save="no")
+  }
+
+  loci = read.delim(infile, stringsAsFactors=F, header=F)
+
+  chroms = read.delim(chrom_file, stringsAsFactors=F, header=F)
+
+  for (i in 1:nrow(chroms)) {
+    selection = loci[loci[,1]==chroms[i,1],]
+    chrom_id = chroms[i,2]
+    write.table(selection, file=paste(outdir, prefix, chrom_id, postfix, sep=""), quote=F, row.names=F, col.names=F, sep="\t")
+  }
+}
+
+#' Calculate power to call subclones. 
+#' 
+#' This function calculates the 
+#' average number of reads per clonal chromosome copy.
+#' @param purity The sample purity
+#' @param ploidy The tumour ploidy
+#' @param coverage The tumour sample coverage
+#' @return The average reads per chromosome copy, a.k.a. power
+#' @author sd11
+calc_power = function(purity, ploidy, coverage) {
+  return(round((purity) / (purity*ploidy + (1-purity)*2) * coverage, 3))
+}