diff --git a/DESCRIPTION b/DESCRIPTION index c63dc23..caa6771 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,7 @@ Package: dpclust3p Type: Package Title: Dirichlet Process Clustering Pre-Processing Package -Version: 1.0.5 -Date: 2016-08-03 +Version: 1.0.6 Author: Stefan Dentro Maintainer: Stefan Dentro License: GPL-3 diff --git a/NAMESPACE b/NAMESPACE index 1532c6b..017337f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,6 @@ export(ascatNgsToBattenberg) export(ascatToBattenberg) export(collate_bb_subclones) export(concat_files) -export(createHistFacetPlot) export(createPng) export(dpIn2vcf) export(dumpCounts.Broad) @@ -19,13 +18,15 @@ export(filterForDeaminase) export(filterForSignature) export(getTrinucleotideContext) export(identifyKataegis) -export(meltFacetPlotData) export(mut_cn_phasing) export(mut_mut_phasing) export(mutationBurdenToMutationCopyNumber) export(mutationCopyNumberToMutationBurden) export(parseFai) export(parseIgnore) +export(plot_ccf_hist) +export(plot_mcn_hist) +export(plot_vaf_hist) export(runGetDirichletProcessInfo) export(split_by_chrom) export(vcf2loci) diff --git a/R/LoadData.R b/R/LoadData.R old mode 100755 new mode 100644 diff --git a/R/allelecount.R b/R/allelecount.R new file mode 100644 index 0000000..8c4e6fd --- /dev/null +++ b/R/allelecount.R @@ -0,0 +1,422 @@ +################################################ +# Allele counting functions - SNV, indel and SV +################################################ +#' Run alleleCount +#' +#' Count the alleles for specified locations in the loci file. Expects alleleCount binary in $PATH +#' @param locifile A file with at least chromsome and position columns of the locations to be counted +#' @param bam A bam file +#' @param outfile Where to store the output +#' @param min_baq The minimum base quality required for a read to be counted +#' @param min_maq The minimum mapping quality required for a read to be counted +#' @author sd11 +#' @export +alleleCount = function(locifile, bam, outfile, min_baq=20, min_maq=35) { + cmd = paste(ALLELECOUNTER, + "-b", bam, + "-o", outfile, + "-l", locifile, + "-m", min_baq, + "-q", min_maq, sep=" ") + system(cmd, wait=T) +} + +#' Dump allele counts from vcf - Sanger ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.Sanger = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="sanger", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - DKFZ ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.DKFZ = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="dkfz", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - Broad ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.Broad = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="broad", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - MuSE ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.Muse = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="muse", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - ICGC pancancer consensus SNV pipeline +#' +#' Dump allele counts stored in the info column of the VCF file. Output will go into a file +#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned +#' by alleleCounter. There are no counts for the matched normal. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.ICGCconsensusSNV = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus_snv", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - ICGC pancancer consensus indel pipeline +#' +#' Dump allele counts stored in the info column of the VCF file. Output will go into a file +#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned +#' by alleleCounter. There are no counts for the matched normal. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @author sd11 +#' @export +dumpCounts.ICGCconsensusIndel = function(vcf_infile, tumour_outfile, refence_genome="hg19", dummy_alt_allele=NA, dummy_ref_allele=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus_indel", normal_outfile=NA, refence_genome=refence_genome, samplename=NA, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele) +} + +#' Dump allele counts from vcf - Mutect +#' +#' Dump allele counts stored in the info column of the VCF file. Output will go into a file +#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned +#' by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param samplename Parameter specifying the samplename to be used for matching the right column in the VCF +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @author sd11 +#' @export +dumpCounts.mutect = function(vcf_infile, tumour_outfile, samplename, normal_outfile=NA, refence_genome="hg19") { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="mutect", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from VCF +#' +#' This function implements all the steps required for dumping counts from VCF +#' as supplied by the ICGC pancancer pipelines. +#' @noRd +dumpCountsFromVcf = function(vcf_infile, tumour_outfile, centre, normal_outfile=NA, refence_genome="hg19", samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) { + # Helper function for writing the output + write.output = function(output, output_file) { + write.table(output, file=output_file, col.names=T, quote=F, row.names=F, sep="\t") + } + + # Read in the vcf and dump the tumour counts in the right format + v = VariantAnnotation::readVcf(vcf_infile, refence_genome) + tumour = getCountsTumour(v, centre=centre, samplename=samplename, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele) + tumour = formatOutput(tumour, v) + write.output(tumour, tumour_outfile) + + # Optionally dump the normal counts in the right format + if (!is.na(normal_outfile)) { + normal = getCountsNormal(v, centre=centre, samplename=samplename) + normal = formatOutput(normal, v) + write.output(normal, normal_outfile) + } +} + +#' Format a 4 column counts table into the alleleCounter format. This function assumes A, C, G, T format. +#' @noRd +formatOutput = function(counts_table, v) { + output = data.frame(as.character(seqnames(v)), start(ranges(v)), counts_table, rowSums(counts_table)) + colnames(output) = c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth") + return(output) +} + +#' Dump allele counts from vcf for normal +#' +#' Returns an allele counts table for the normal sample +#' @param v The vcf file +#' @param centre The sequencing centre of which pipeline the vcf file originates +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getCountsNormal = function(v, centre="sanger", samplename=NA) { + if (centre=="sanger") { + return(getAlleleCounts.Sanger(v, 1)) + } else if(centre=="dkfz") { + sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "CONTROL") + return(getAlleleCounts.DKFZ(v, sample_col)) + } else if (centre=="muse") { + # Assuming the tumour name is provided + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) != samplename) + return (getAlleleCounts.MuSE(v, sample_col)) + } else if (centre=="broad") { + print("The Broad ICGC pipeline does not report allele counts for the matched normal") + q(save="no", status=1) + } else if (centre=="icgc_consensus_snv") { + print("The ICGC consensus pipeline does not report allele counts for the matched normal") + q(save="no", status=1) + } else if (centre=="icgc_consensus_indel") { + print("The ICGC consensus indel pipeline does not report allele counts for the matched normal") + q(save="no", status=1) + } else if (centre=="mutect") { + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) + return(getAlleleCounts.mutect(v, sample_col)) + } else { + print(paste("Supplied centre not supported:", centre)) + q(save="no", status=1) + } +} + +#' Dump allele counts from vcf for tumour +#' +#' Returns an allele counts table for the tumour sample +#' @param v The vcf file +#' @param centre The sequencing centre of which pipeline the vcf file originates +#' @param samplename Samplename to be used when matching sample columns (Default: NA) +#' @param dummy_alt_allele Supply if the alt allele is to be overwritten. The counts of the alt will be put in the column corresponding to this allele. (Default: NA) +#' @param dummy_ref_allele Supply if the ref allele is to be overwritten. The counts of the ref will be put in the column corresponding to this allele. (Default: NA) +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getCountsTumour = function(v, centre="sanger", samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) { + if (centre=="sanger") { + return(getAlleleCounts.Sanger(v, 2)) + } else if (centre=="dkfz") { + sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "TUMOR") + return(getAlleleCounts.DKFZ(v, sample_col)) + } else if (centre=="muse") { + if (is.na(samplename)) { stop("When dumping allele counts from the Muse samplename must be supplied") } + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) + return (getAlleleCounts.MuSE(v, sample_col)) + } else if (centre=="broad") { + if (is.na(samplename)) { stop("When dumping allele counts from the Broad ICGC pipeline samplename must be supplied") } + return(getAlleleCounts.Broad(v, 1)) + } else if (centre=="icgc_consensus_snv") { + return(getAlleleCounts.ICGC_consensus_snv(v)) + } else if (centre=="icgc_consensus_indel") { + if (is.na(dummy_alt_allele) | is.na(dummy_ref_allele)) { stop("When dumping allele counts from the ICGC consensus indels dummy_alt_allele and dummy_ref_allele must be supplied") } + return(getAlleleCounts.ICGC_consensus_indel(v, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)) + } else if (centre=="mutect") { + if (is.na(samplename)) { stop("When dumping allele counts from the Mutect samplename must be supplied") } + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) + return(getAlleleCounts.mutect(v, sample_col)) + } else { + print(paste("Supplied centre not supported:", centre)) + q(save="no", status=1) + } +} + +#' Dump allele counts from Sanger pipeline vcf +#' +#' Helper function that dumps the allele counts from a Sanger pipeline VCF file +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.Sanger = function(v, sample_col) { + return(cbind(VariantAnnotation::geno(v)$FAZ[,sample_col]+VariantAnnotation::geno(v)$RAZ[,sample_col], + VariantAnnotation::geno(v)$FCZ[,sample_col]+VariantAnnotation::geno(v)$RCZ[,sample_col], + VariantAnnotation::geno(v)$FGZ[,sample_col]+VariantAnnotation::geno(v)$RGZ[,sample_col], + VariantAnnotation::geno(v)$FTZ[,sample_col]+VariantAnnotation::geno(v)$RTZ[,sample_col])) +} + +#' Dump allele counts from the DKFZ pipeline +#' +#' Helper function that takes a sample column and fetches allele counts. As the DKFZ pipeline does not +#' provide counts for each base, but just the alt and reference, we will provide just those and the +#' other bases with 0s. +#' +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.DKFZ = function(v, sample_col) { + # Fetch counts for both forward and reverse ref/alt + counts = VariantAnnotation::geno(v)$DP4[,sample_col,] + counts.ref = counts[,1] + counts[,2] # ref forward/reverse + counts.alt = counts[,3] + counts[,4] # alt forward/reverse + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts from the MuSE pipeline +#' +#' Helper function that takes a sample column and fetches allele counts. As the MuSE pipeline does not +#' provide counts for each base, but just the alt and reference, we will provide just those and the +#' other bases with 0s. +#' +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +#' Note2: This function assumes there are only two columns with read counts. The format allows for more +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.MuSE = function(v, sample_col) { + if (length(colnames(VariantAnnotation::geno(v)$AD)) > 2) { + print("In getAlleleCounts.MuSE: Assuming 2 columns with read counts, but found more. This is not supported") + q(save="no", status=1) + } + + # An SNV can be represented by more than 1 alt alleles, here we pick the alt allele with the highest read count + num.snvs = nrow(VariantAnnotation::geno(v)$AD) + counts = array(NA, c(num.snvs, 2)) + allele.alt = array(NA, num.snvs) + for (i in 1:num.snvs) { + snv.counts = unlist(VariantAnnotation::geno(v)$AD[i,sample_col]) + counts[i,1] = snv.counts[1] # The reference is the first base for which read counts are mentioned + select_base = which.max(snv.counts[2:length(snv.counts)]) + allele.alt[i] = as.character(VariantAnnotation::alt(v)[[i]][select_base]) + select_base = select_base+1 # The reference is the first base for which read counts are mentioned + counts[i,2] = snv.counts[select_base] + } + + allele.ref = as.character(VariantAnnotation::ref(v)) + + output = construct_allelecounter_table(counts[i,1], counts[i,2], allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts from the Broad pipeline +#' +#' Helper function that takes a sample column and fetches allele counts. As the Broad pipeline does not +#' provide counts for each base, but just the alt and reference, we will provide just those and the +#' other bases with 0s. +#' +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.Broad = function(v, sample_col) { + count.ref = as.numeric(unlist(VariantAnnotation::geno(v)$ref_count)) + count.alt = as.numeric(unlist(VariantAnnotation::geno(v)$alt_count)) + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts in ICGC consensus SNV format +#' +#' This function fetches allele counts from the info field in the VCF file. +#' +#' @param v The vcf file +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +getAlleleCounts.ICGC_consensus_snv = function(v) { + count.alt = info(v)$t_alt_count + count.ref = info(v)$t_ref_count + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts in ICGC consensus indel format +#' +#' This function fetches allele counts from the info field in the VCF file. +#' +#' @param v The vcf file +#' @param dummy_alt_allele Dummy base to use to encode the alt allele, the counts will be saved in the corresponding column +#' @param dummy_ref_allele Dummy base to use to encode the ref allele, the counts will be saved in the corresponding column +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +getAlleleCounts.ICGC_consensus_indel = function(v, dummy_alt_allele="A", dummy_ref_allele="C") { + c = construct_allelecounter_table(count.ref=info(v)$t_ref_count, + count.alt=info(v)$t_alt_count, + allele.ref=rep(dummy_ref_allele, nrow(v)), + allele.alt=rep(dummy_alt_allele, nrow(v))) + return(c) +} + +#' Dump allele counts in Mutect format +#' +#' This function fetches allele counts from the info field in the VCF file. +#' +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +getAlleleCounts.mutect = function(v, sample_col) { + counts = do.call(rbind, geno(v)$AD[,sample_col]) + count.ref = counts[,1] + count.alt = counts[,2] + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Function that constructs a table in the format of the allele counter +#' @param count.ref Number of reads supporting the reference allele +#' @param count.alt Number of reads supporting the variant allele +#' @param allele.ref The reference allele +#' @param allele.alt The variant allele +#' @return A data.frame consisting of four columns: Reads reporting A, C, G and T +#' @author sd11 +#' @noRd +construct_allelecounter_table = function(count.ref, count.alt, allele.ref, allele.alt) { + output = array(0, c(length(allele.ref), 4)) + nucleotides = c("A", "C", "G", "T") + # Propagate the alt allele counts + nucleo.index = match(allele.alt, nucleotides) + for (i in 1:nrow(output)) { + output[i,nucleo.index[i]] = count.alt[i] + } + + # Propagate the reference allele counts + nucleo.index = match(allele.ref, nucleotides) + for (i in 1:nrow(output)) { + output[i,nucleo.index[i]] = count.ref[i] + } + return(output) +} \ No newline at end of file diff --git a/R/copynumber.R b/R/copynumber.R index cb66bc1..3b5c951 100644 --- a/R/copynumber.R +++ b/R/copynumber.R @@ -45,6 +45,11 @@ collate_bb_subclones = function(samplename, bb_subclones_file, gender, outfile) cndata = cndata[!cndata$chr=="X",] } + # Remove all segments witout a call + if (any(is.na(cndata$nMin1_A) | is.na(cndata$nMaj1_A))) { + cndata = cndata[!(is.na(cndata$nMin1_A) | is.na(cndata$nMaj1_A)),] + } + # Obtain the total copy number for each segment totalcn = rep(0, dim(cndata)[1]) for (k in 1:nrow(cndata)) { diff --git a/R/interconvertMutationBurdens.R b/R/interconvertMutationBurdens.R old mode 100755 new mode 100644 diff --git a/R/preprocessing.R b/R/preprocessing.R index 3e7e9d6..8ae807a 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -1,66 +1,6 @@ ALLELECOUNTER = "alleleCounter" LINKAGEPULL = "Linkage_pull.pl" -#' Concatenate split files -#' -#' Convenience function to concatenate a series of files specified in a file of file names. -#' This function assumes all files have the same layout. -#' @param fofn A file of file names to be concatenated -#' @param inputdir Full path to where the input files are stored -#' @param outfile Full path to where the output should be written -#' @param haveHeader Boolean that specifies whether the input files have a header -#' @author sd11 -#' @export -concat_files = function(fofn, inputdir, outfile, haveHeader) { - - inputdir = paste(inputdir,"/", sep="") - list_of_files = read.table(fofn, stringsAsFactors=F, header=F)[,1] - - output = data.frame() - for (infile in list_of_files) { - infile = paste(inputdir, infile, sep="") - # Check if file is there and it contains data - if (file.exists(infile) & file.info(infile)$size != 0) { - dat = read.delim(infile, header=haveHeader, quote=NULL, stringsAsFactors=F) - output = rbind(output, dat) - } - } - - write.table(output, file=outfile, col.names=haveHeader, row.names=F, sep="\t", quote=F) -} - -#' Split a file per chromosome -#' -#' Convenience function to split an input file per chromosome. All it requires is that -#' the infile has as first column chromosome specification. The output files will be named -#' outdir/prefixCHROMNUMBERpostfix -#' @param infile The file to be split -#' @param prefix Prefix of the output file -#' @param postfix Postfix of the output file -#' @param outdir Directory where the output files are to be written -#' @param chrom_file A simple list of chromosomes to be considered -#' @author sd11 -#' @export -split_by_chrom = function(infile, prefix, postfix, outdir, chrom_file) { - outdir = paste(outdir, "/", sep="") - - # Check if there are lines in the file, otherwise it will crash this script - if (file.info(infile)$size == 0) { - print("No lines in loci file") - q(save="no") - } - - loci = read.delim(infile, stringsAsFactors=F, header=F) - - chroms = read.delim(chrom_file, stringsAsFactors=F, header=F) - - for (i in 1:nrow(chroms)) { - selection = loci[loci[,1]==chroms[i,1],] - chrom_id = chroms[i,2] - write.table(selection, file=paste(outdir, prefix, chrom_id, postfix, sep=""), quote=F, row.names=F, col.names=F, sep="\t") - } -} - ############################################ # VCF 2 LOCI ############################################ @@ -193,7 +133,7 @@ filterForSignature = function(signature_anno_loci_file, signature_regex, outfile #' Convenience function that filters a with tri-nucleotide context annotated list of loci for #' cytosine deaminase signature, or C>T at CpG. -#' @param signature_anno_loci_file Filepath to a with tri-nucleotide context annotated loci +#' @param loci_file Filepath to a with tri-nucleotide context annotated loci #' @param outfile Filepath to where to store the output. #' @param ref_genome Full path to an indexed reference genome fasta file #' @param trinucleotide_column Integer representing the column within the input file that contains the context @@ -211,430 +151,6 @@ filterForDeaminase = function(loci_file, outfile, ref_genome, trinucleotide_colu alt_allele_column=alt_allele_column) } - -############################################ -# Allele counting -############################################ -#' Run alleleCount -#' -#' Count the alleles for specified locations in the loci file. Expects alleleCount binary in $PATH -#' @param locifile A file with at least chromsome and position columns of the locations to be counted -#' @param bam A bam file -#' @param outfile Where to store the output -#' @param min_baq The minimum base quality required for a read to be counted -#' @param min_maq The minimum mapping quality required for a read to be counted -#' @author sd11 -#' @export -alleleCount = function(locifile, bam, outfile, min_baq=20, min_maq=35) { - cmd = paste(ALLELECOUNTER, - "-b", bam, - "-o", outfile, - "-l", locifile, - "-m", min_baq, - "-q", min_maq, sep=" ") - system(cmd, wait=T) -} - -#' Dump allele counts from vcf - Sanger ICGC pancancer pipeline -#' -#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file -#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted -#' allele counts file as returned by alleleCounter. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param normal_outfile Optional parameter specifying where the normal output should go -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF -#' @author sd11 -#' @export -dumpCounts.Sanger = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="sanger", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) -} - -#' Dump allele counts from vcf - DKFZ ICGC pancancer pipeline -#' -#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file -#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted -#' allele counts file as returned by alleleCounter. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param normal_outfile Optional parameter specifying where the normal output should go -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF -#' @author sd11 -#' @export -dumpCounts.DKFZ = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="dkfz", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) -} - -#' Dump allele counts from vcf - Broad ICGC pancancer pipeline -#' -#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file -#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted -#' allele counts file as returned by alleleCounter. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param normal_outfile Optional parameter specifying where the normal output should go -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF -#' @author sd11 -#' @export -dumpCounts.Broad = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="broad", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) -} - -#' Dump allele counts from vcf - MuSE ICGC pancancer pipeline -#' -#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file -#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted -#' allele counts file as returned by alleleCounter. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param normal_outfile Optional parameter specifying where the normal output should go -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF -#' @author sd11 -#' @export -dumpCounts.Muse = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="muse", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) -} - -#' Dump allele counts from vcf - ICGC pancancer consensus SNV pipeline -#' -#' Dump allele counts stored in the info column of the VCF file. Output will go into a file -#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned -#' by alleleCounter. There are no counts for the matched normal. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param normal_outfile Optional parameter specifying where the normal output should go -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF -#' @author sd11 -#' @export -dumpCounts.ICGCconsensusSNV = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus_snv", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) -} - -#' Dump allele counts from vcf - ICGC pancancer consensus indel pipeline -#' -#' Dump allele counts stored in the info column of the VCF file. Output will go into a file -#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned -#' by alleleCounter. There are no counts for the matched normal. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @author sd11 -#' @export -dumpCounts.ICGCconsensusIndel = function(vcf_infile, tumour_outfile, refence_genome="hg19", dummy_alt_allele=NA, dummy_ref_allele=NA) { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus_indel", normal_outfile=NA, refence_genome=refence_genome, samplename=NA, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele) -} - -#' Dump allele counts from vcf - Mutect -#' -#' Dump allele counts stored in the info column of the VCF file. Output will go into a file -#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned -#' by alleleCounter. -#' @param vcf_infile The vcf file to read in -#' @param tumour_outfile File to save the tumour counts to -#' @param samplename Parameter specifying the samplename to be used for matching the right column in the VCF -#' @param normal_outfile Optional parameter specifying where the normal output should go -#' @param refence_genome Optional parameter specifying the reference genome build used -#' @author sd11 -#' @export -dumpCounts.mutect = function(vcf_infile, tumour_outfile, samplename, normal_outfile=NA, refence_genome="hg19") { - dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="mutect", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) -} - -#' Dump allele counts from VCF -#' -#' This function implements all the steps required for dumping counts from VCF -#' as supplied by the ICGC pancancer pipelines. -#' @noRd -dumpCountsFromVcf = function(vcf_infile, tumour_outfile, centre, normal_outfile=NA, refence_genome="hg19", samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) { - # Helper function for writing the output - write.output = function(output, output_file) { - write.table(output, file=output_file, col.names=T, quote=F, row.names=F, sep="\t") - } - - # Read in the vcf and dump the tumour counts in the right format - v = VariantAnnotation::readVcf(vcf_infile, refence_genome) - tumour = getCountsTumour(v, centre=centre, samplename=samplename, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele) - tumour = formatOutput(tumour, v) - write.output(tumour, tumour_outfile) - - # Optionally dump the normal counts in the right format - if (!is.na(normal_outfile)) { - normal = getCountsNormal(v, centre=centre, samplename=samplename) - normal = formatOutput(normal, v) - write.output(normal, normal_outfile) - } -} - -#' Format a 4 column counts table into the alleleCounter format. This function assumes A, C, G, T format. -#' @noRd -formatOutput = function(counts_table, v) { - output = data.frame(as.character(seqnames(v)), start(ranges(v)), counts_table, rowSums(counts_table)) - colnames(output) = c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth") - return(output) -} - -#' Dump allele counts from vcf for normal -#' -#' Returns an allele counts table for the normal sample -#' @param v The vcf file -#' @param centre The sequencing centre of which pipeline the vcf file originates -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -getCountsNormal = function(v, centre="sanger", samplename=NA) { - if (centre=="sanger") { - return(getAlleleCounts.Sanger(v, 1)) - } else if(centre=="dkfz") { - sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "CONTROL") - return(getAlleleCounts.DKFZ(v, sample_col)) - } else if (centre=="muse") { - # Assuming the tumour name is provided - sample_col = which(colnames(VariantAnnotation::geno(v)$AD) != samplename) - return (getAlleleCounts.MuSE(v, sample_col)) - } else if (centre=="broad") { - print("The Broad ICGC pipeline does not report allele counts for the matched normal") - q(save="no", status=1) - } else if (centre=="icgc_consensus_snv") { - print("The ICGC consensus pipeline does not report allele counts for the matched normal") - q(save="no", status=1) - } else if (centre=="icgc_consensus_indel") { - print("The ICGC consensus indel pipeline does not report allele counts for the matched normal") - q(save="no", status=1) - } else if (centre=="mutect") { - sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) - return(getAlleleCounts.mutect(v, sample_col)) - } else { - print(paste("Supplied centre not supported:", centre)) - q(save="no", status=1) - } -} - -#' Dump allele counts from vcf for tumour -#' -#' Returns an allele counts table for the tumour sample -#' @param v The vcf file -#' @param centre The sequencing centre of which pipeline the vcf file originates -#' @param samplename Samplename to be used when matching sample columns (Default: NA) -#' @param dummy_alt_allele Supply if the alt allele is to be overwritten. The counts of the alt will be put in the column corresponding to this allele. (Default: NA) -#' @param dummy_ref_allele Supply if the ref allele is to be overwritten. The counts of the ref will be put in the column corresponding to this allele. (Default: NA) -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -getCountsTumour = function(v, centre="sanger", samplename=NA, dummy_alt_allele=NA, dummy_ref_allele=NA) { - if (centre=="sanger") { - return(getAlleleCounts.Sanger(v, 2)) - } else if (centre=="dkfz") { - sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "TUMOR") - return(getAlleleCounts.DKFZ(v, sample_col)) - } else if (centre=="muse") { - if (is.na(samplename)) { stop("When dumping allele counts from the Muse samplename must be supplied") } - sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) - return (getAlleleCounts.MuSE(v, sample_col)) - } else if (centre=="broad") { - if (is.na(samplename)) { stop("When dumping allele counts from the Broad ICGC pipeline samplename must be supplied") } - return(getAlleleCounts.Broad(v, 1)) - } else if (centre=="icgc_consensus_snv") { - return(getAlleleCounts.ICGC_consensus_snv(v)) - } else if (centre=="icgc_consensus_indel") { - if (is.na(dummy_alt_allele) | is.na(dummy_ref_allele)) { stop("When dumping allele counts from the ICGC consensus indels dummy_alt_allele and dummy_ref_allele must be supplied") } - return(getAlleleCounts.ICGC_consensus_indel(v, dummy_alt_allele=dummy_alt_allele, dummy_ref_allele=dummy_ref_allele)) - } else if (centre=="mutect") { - if (is.na(samplename)) { stop("When dumping allele counts from the Mutect samplename must be supplied") } - sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) - return(getAlleleCounts.mutect(v, sample_col)) - } else { - print(paste("Supplied centre not supported:", centre)) - q(save="no", status=1) - } -} - -#' Dump allele counts from Sanger pipeline vcf -#' -#' Helper function that dumps the allele counts from a Sanger pipeline VCF file -#' @param v The vcf file -#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -getAlleleCounts.Sanger = function(v, sample_col) { - return(cbind(VariantAnnotation::geno(v)$FAZ[,sample_col]+VariantAnnotation::geno(v)$RAZ[,sample_col], - VariantAnnotation::geno(v)$FCZ[,sample_col]+VariantAnnotation::geno(v)$RCZ[,sample_col], - VariantAnnotation::geno(v)$FGZ[,sample_col]+VariantAnnotation::geno(v)$RGZ[,sample_col], - VariantAnnotation::geno(v)$FTZ[,sample_col]+VariantAnnotation::geno(v)$RTZ[,sample_col])) -} - -#' Dump allele counts from the DKFZ pipeline -#' -#' Helper function that takes a sample column and fetches allele counts. As the DKFZ pipeline does not -#' provide counts for each base, but just the alt and reference, we will provide just those and the -#' other bases with 0s. -#' -#' Note: If there are multiple ALT alleles this function will only take the first mentioned! -#' @param v The vcf file -#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -getAlleleCounts.DKFZ = function(v, sample_col) { - # Fetch counts for both forward and reverse ref/alt - counts = VariantAnnotation::geno(v)$DP4[,sample_col,] - counts.ref = counts[,1] + counts[,2] # ref forward/reverse - counts.alt = counts[,3] + counts[,4] # alt forward/reverse - allele.ref = as.character(VariantAnnotation::ref(v)) - allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) - - output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) - return(output) -} - -#' Dump allele counts from the MuSE pipeline -#' -#' Helper function that takes a sample column and fetches allele counts. As the MuSE pipeline does not -#' provide counts for each base, but just the alt and reference, we will provide just those and the -#' other bases with 0s. -#' -#' Note: If there are multiple ALT alleles this function will only take the first mentioned! -#' Note2: This function assumes there are only two columns with read counts. The format allows for more -#' @param v The vcf file -#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -getAlleleCounts.MuSE = function(v, sample_col) { - if (length(colnames(VariantAnnotation::geno(v)$AD)) > 2) { - print("In getAlleleCounts.MuSE: Assuming 2 columns with read counts, but found more. This is not supported") - q(save="no", status=1) - } - - # An SNV can be represented by more than 1 alt alleles, here we pick the alt allele with the highest read count - num.snvs = nrow(VariantAnnotation::geno(v)$AD) - counts = array(NA, c(num.snvs, 2)) - allele.alt = array(NA, num.snvs) - for (i in 1:num.snvs) { - snv.counts = unlist(VariantAnnotation::geno(v)$AD[i,sample_col]) - counts[i,1] = snv.counts[1] # The reference is the first base for which read counts are mentioned - select_base = which.max(snv.counts[2:length(snv.counts)]) - allele.alt[i] = as.character(VariantAnnotation::alt(v)[[i]][select_base]) - select_base = select_base+1 # The reference is the first base for which read counts are mentioned - counts[i,2] = snv.counts[select_base] - } - - allele.ref = as.character(VariantAnnotation::ref(v)) - - output = construct_allelecounter_table(counts[i,1], counts[i,2], allele.ref, allele.alt) - return(output) -} - -#' Dump allele counts from the Broad pipeline -#' -#' Helper function that takes a sample column and fetches allele counts. As the Broad pipeline does not -#' provide counts for each base, but just the alt and reference, we will provide just those and the -#' other bases with 0s. -#' -#' Note: If there are multiple ALT alleles this function will only take the first mentioned! -#' @param v The vcf file -#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -getAlleleCounts.Broad = function(v, sample_col) { - count.ref = as.numeric(unlist(VariantAnnotation::geno(v)$ref_count)) - count.alt = as.numeric(unlist(VariantAnnotation::geno(v)$alt_count)) - allele.ref = as.character(VariantAnnotation::ref(v)) - allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) - - output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) - return(output) -} - -#' Dump allele counts in ICGC consensus SNV format -#' -#' This function fetches allele counts from the info field in the VCF file. -#' -#' @param v The vcf file -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -#' Note: If there are multiple ALT alleles this function will only take the first mentioned! -getAlleleCounts.ICGC_consensus_snv = function(v) { - count.alt = info(v)$t_alt_count - count.ref = info(v)$t_ref_count - allele.ref = as.character(VariantAnnotation::ref(v)) - allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) - - output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) - return(output) -} - -#' Dump allele counts in ICGC consensus SNV format -#' -#' This function fetches allele counts from the info field in the VCF file. -#' -#' @param v The vcf file -#' @param dummy_alt_allele Dummy base to use to encode the alt allele, the counts will be saved in the corresponding column -#' @param dummy_ref_allele Dummy base to use to encode the ref allele, the counts will be saved in the corresponding column -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -#' Note: If there are multiple ALT alleles this function will only take the first mentioned! -getAlleleCounts.ICGC_consensus_indel = function(v, dummy_alt_allele="A", dummy_ref_allele="C") { - c = construct_allelecounter_table(count.ref=info(v)$t_ref_count, - count.alt=info(v)$t_alt_count, - allele.ref=rep(dummy_ref_allele, nrow(v)), - allele.alt=rep(dummy_alt_allele, nrow(v))) - return(c) -} - -#' Dump allele counts in Mutect format -#' -#' This function fetches allele counts from the info field in the VCF file. -#' -#' @param v The vcf file -#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 -#' @return An array with 4 columns: Counts for A, C, G, T -#' @author sd11 -#' @noRd -#' Note: If there are multiple ALT alleles this function will only take the first mentioned! -getAlleleCounts.mutect = function(v, sample_col) { - counts = do.call(rbind, geno(v)$AD[,sample_col]) - count.ref = counts[,1] - count.alt = counts[,2] - allele.ref = as.character(VariantAnnotation::ref(v)) - allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) - output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) - return(output) -} - -#' Function that constructs a table in the format of the allele counter -#' @param count.ref Number of reads supporting the reference allele -#' @param count.alt Number of reads supporting the variant allele -#' @param allele.ref The reference allele -#' @param allele.alt The variant allele -#' @return A data.frame consisting of four columns: Reads reporting A, C, G and T -#' @author sd11 -#' @noRd -construct_allelecounter_table = function(count.ref, count.alt, allele.ref, allele.alt) { - output = array(0, c(length(allele.ref), 4)) - nucleotides = c("A", "C", "G", "T") - # Propagate the alt allele counts - nucleo.index = match(allele.alt, nucleotides) - for (i in 1:nrow(output)) { - output[i,nucleo.index[i]] = count.alt[i] - } - - # Propagate the reference allele counts - nucleo.index = match(allele.ref, nucleotides) - for (i in 1:nrow(output)) { - output[i,nucleo.index[i]] = count.ref[i] - } - return(output) -} - ############################################ # MUT 2 MUT phasing ############################################ @@ -1001,6 +517,18 @@ addYchromToBattenberg = function(subclone.data) { #' Main function that creates the DP input file. A higher level function should be called by users #' @noRd GetDirichletProcessInfo<-function(outputfile, cellularity, info, subclone.file, is.male = F, out.dir = NULL, SNP.phase.file = NULL, mut.phase.file = NULL, adjust_male_y_chrom=F){ + + write_output = function(info, outputfile) { + # convert GenomicRanges object to df + df = data.frame(chr=as.data.frame(seqnames(info)), + start=start(info)-1, + end=end(info)) + df = cbind(df, as.data.frame(elementMetadata(info))) + colnames(df)[1] = "chr" + df = df[with(df, order(chr)),] + print(head(df)) + write.table(df, outputfile, sep="\t", row.names=F, quote=F) + } subclone.data = read.table(subclone.file,sep="\t",header=T,stringsAsFactors=F) # Add in the Y chrom if donor is male and Battenberg hasn't supplied it (BB returns X/Y ad multiple copies of X for men) @@ -1037,14 +565,22 @@ GetDirichletProcessInfo<-function(outputfile, cellularity, info, subclone.file, if(is.male & "chr" %in% names(info)){ normal.CN = rep(2,nrow(info)) normal.CN[info$chr=="X"| info$chr=="Y"] = 1 - info$mutation.copy.number = mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity, normal.CN) + info$mutation.copy.number = dpclust3p:::mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity, normal.CN) }else{ - info$mutation.copy.number = mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity) + info$mutation.copy.number = dpclust3p:::mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity) } # convert MCN to subclonal fraction - tricky for amplified mutations info$subclonal.fraction = info$mutation.copy.number - expected.burden.for.MCN = mutationCopyNumberToMutationBurden(rep(1,length(info)),info$subclonal.CN,cellularity) + expected.burden.for.MCN = dpclust3p:::mutationCopyNumberToMutationBurden(rep(1,length(info)),info$subclonal.CN,cellularity) + info$no.chrs.bearing.mut = NA + + # Check if any SNVs have a copy number state, if there are none we can stop here + if (all(is.na(info$nMaj1))) { + write_output(info, outputfile) + return() + } + non.zero.indices = which(info$mut.count>0 & !is.na(expected.burden.for.MCN)) #test for mutations in more than 1 copy p.vals = sapply(1:length(non.zero.indices),function(v,e,i){ @@ -1152,15 +688,7 @@ GetDirichletProcessInfo<-function(outputfile, cellularity, info, subclone.file, info$no.chrs.bearing.mut[possible.zero.muts[del.indices]] = 0 } - # convert GenomicRanges object to df - df = data.frame(chr=as.data.frame(seqnames(info)), - start=start(info)-1, - end=end(info)) - df = cbind(df, as.data.frame(elementMetadata(info))) - colnames(df)[1] = "chr" - df = df[with(df, order(chr)),] - print(head(df)) - write.table(df, outputfile, sep="\t", row.names=F, quote=F) + write_output(info, outputfile) } #' Convenience function to load the cellularity from a rho_and_psi file diff --git a/R/qualitycontrol.R b/R/qualitycontrol.R index 9c898b8..53ed8fe 100644 --- a/R/qualitycontrol.R +++ b/R/qualitycontrol.R @@ -1,6 +1,11 @@ ############################################## # QC ############################################## +#' Create a PNG file +#' @param p The figure +#' @param filename Where to save the image +#' @param width Width +#' @param height Height #' @export createPng <- function(p, filename, width, height) { png(filename=filename, width=width, height=height) @@ -8,7 +13,8 @@ createPng <- function(p, filename, width, height) { dev.off() } -#' @export +#' Convenience function used for creating figures +#' @noRd meltFacetPlotData <- function(data, subsamplenames) { d = as.data.frame(data) colnames(d) = subsamplenames @@ -16,95 +22,56 @@ meltFacetPlotData <- function(data, subsamplenames) { return(data.m) } -#' @export +#' Template to create histograms +#' @noRd createHistFacetPlot <- function(data, title, xlab, ylab, binwidth) { p = ggplot2::ggplot(data) + ggplot2::aes(x=value, y=..count..) + ggplot2::geom_histogram(binwidth=binwidth, colour="black", fill="gray") + ggplot2::facet_grid(variable ~ .) p = p + ggplot2::theme_bw(base_size=35) + ggplot2::ggtitle(title) + ggplot2::xlab(xlab) + ggplot2::ylab(ylab) return(p) } +#' Template to create box plots +#' @noRd createBoxFacetPlot <- function(data, title, xlab, ylab) { p = ggplot2::ggplot(data) + ggplot2::aes(x=variable, y=value) + ggplot2::geom_boxplot() + ggplot2::facet_grid(subsample ~ .) p = p + ggplot2::theme_bw() + ggplot2::ggtitle(title) + ggplot2::xlab(xlab) + ggplot2::ylab(ylab) return(p) } -createQCDocument <- function(res, samplename, subsamplenames, outpath, cellularity) { - p = createHistFacetPlot(meltFacetPlotData(res$totalCopyNumber, subsamplenames), paste(samplename, "totalCopyNumber"), "Copynumber", "Count", binwidth=1) - createPng(p, paste(outpath, samplename, "_totalCopyNumber.png", sep=""), width=1500, height=500*length(subsamplenames)) - - p = createHistFacetPlot(meltFacetPlotData(res$mutation.copy.number, subsamplenames), paste(samplename, "mutation.copy.number"), "mutation.copy.number", "Count", binwidth=0.1) +#' Plot mutation copy number histogram +#' +#' @param data A DPClust input table +#' @param samplename Name of the sample +#' @param outdir Directory where the figure is to be stored +#' @author sd11 +#' @export +plot_mcn_hist = function(data, samplename, outdir) { + p = createHistFacetPlot(meltFacetPlotData(data$mutation.copy.number, samplename), paste(samplename, "mutation.copy.number"), "mutation.copy.number", "Count", binwidth=0.1) p = p + ggplot2::xlim(0,5) - createPng(p, paste(outpath, samplename, "_mutation.copy.number.png", sep=""), width=1500, height=500*length(subsamplenames)) - - p = createHistFacetPlot(meltFacetPlotData(res$copyNumberAdjustment, subsamplenames), paste(samplename, "copyNumberAdjustment"), "copyNumberAdjustment", "Count (log10)", binwidth=1) - p = p + ggplot2::scale_y_log10() - createPng(p, paste(outpath, samplename, "_copyNumberAdjustment.png", sep=""), width=1500, height=500*length(subsamplenames)) - - p = createHistFacetPlot(meltFacetPlotData(res$mutCount/(res$mutCount+res$WTCount), subsamplenames), paste(samplename, "alleleFrequency"), "Allele Frequency", "Count", binwidth=0.01) - createPng(p, paste(outpath, samplename, "_alleleFrequency.png", sep=""), width=1500, height=500*length(subsamplenames)) - - p = createHistFacetPlot(meltFacetPlotData(res$kappa, subsamplenames), paste(samplename, "kappa"), "Kappa", "Count", binwidth=0.01) - createPng(p, paste(outpath, samplename, "_kappa.png", sep=""), width=1500, height=500*length(subsamplenames)) - # p = createHistFacetPlot(meltFacetPlotData(res$subclonal.fraction, subsamplenames), paste(samplename, "subclonal.fraction"), "Subclonal fraction", "Count", binwidth=0.01) - # createPng(p, paste(outpath, samplename, "_subclonal.fraction.png", sep=""), width=1500, height=500*length(subsamplenames)) - - p = createHistFacetPlot(meltFacetPlotData((res$mutCount+res$WTCount), subsamplenames), paste(samplename, "depth"), "Depth", "Count", binwidth=5) - createPng(p, paste(outpath, samplename, "_depth.png", sep=""), width=1500, height=500*length(subsamplenames)) - - # fractionOfCells = res$mutation.copy.number / res$copyNumberAdjustment - # meltFacetPlotData(fractionOfCells, subsamplenames) - p = createHistFacetPlot(meltFacetPlotData(res$subclonal.fraction, subsamplenames), paste(samplename, "Fraction Of Cells"), "Fraction of Cells", "Count", binwidth=0.05) - p = p + ggplot2::geom_vline(xintercept=0.5, colour="red", linetype="longdash", size=2) + ggplot2::geom_vline(xintercept=1.5, colour="red", linetype="longdash", size=2) + ggplot2::xlim(0,3) #scale_y_log10() - createPng(p, paste(outpath, samplename, "_fractionOfCells.png", sep=""), width=1500, height=500*length(subsamplenames)) - - # manualMutCopyNum = mutationBurdenToMutationCopyNumber(res$mutCount/(res$mutCount+res$WTCount),res$totalCopyNumber ,cellularity) - # p = createHistFacetPlot(meltFacetPlotData(manualMutCopyNum, subsamplenames), paste(samplename, "manualMutCopyNum"), "Mutation copy number", "Count", binwidth=0.1) - # createPng(p, paste(outpath, samplename, "_manualMutCopyNum.png", sep=""), width=500, height=250*length(subsamplenames)) - # - # manualFractionOfCells = mutationBurdenToMutationCopyNumber(res$mutCount/(res$mutCount+res$WTCount),res$totalCopyNumber ,cellularity) / res$copyNumberAdjustment - # p = createHistFacetPlot(meltFacetPlotData(manualFractionOfCells, subsamplenames), paste(samplename, "manualFractionOfCells"), "Fraction of Cells", "Count", binwidth=0.01) - # createPng(p, paste(outpath, samplename, "_manualFractionOfCells.png", sep=""), width=1500, height=500*length(subsamplenames)) - - # Manually melt the data - d.m = data.frame() - for (i in 1:length(subsamplenames)) { - d.loc = data.frame(subsample=subsamplenames[i],variable=factor(dataset$chromosome[,i]), value=dataset$subclonal.fraction[,i]) - d.m = rbind(d.m, d.loc) - } - p = createBoxFacetPlot(d.m, paste(samplename, "subclonal fraction per chrom"), "Chromosome", "Subclonal fraction") - createPng(p, paste(outpath, samplename, "_subclonalFractionPerChromosome.png", sep=""), width=1500, height=500*length(subsamplenames)) - - # Manually melt the data - d = as.data.frame(dataset$chromosome) - colnames(d) = subsamplenames - d.m = data.frame() - for (i in 1:ncol(d)) { - d.loc = d[dataset$subclonal.fraction > 1.5,i] - if(length(d.loc) > 0) { - d.m = rbind(d.m, data.frame(variable=colnames(d)[i], value=factor(d.loc))) - } - } - - if (nrow(d.m) > 0) { - p = createHistFacetPlot(d.m, paste(samplename, "subclonal fraction > 1.5"), "Chromosome", "Count", binwidth=1) - createPng(p, paste(outpath, samplename, "_large.subclonal.fraction.by.chrom.png", sep=""), width=1500, height=500*length(subsamplenames)) - } + createPng(p, file.path(outpath, paste(samplename, "_mutation.copy.number.png", sep="")), width=1500, height=500) } -#' -#' Run the QC on specified data -#' -runQc = function(infile, datpath, outpath) { - sample2purity = read.table(infile, header=T, stringsAsFactors=F) - for (samplename in unique(sample2purity$sample)) { - print(samplename) - datafiles = sample2purity[sample2purity$sample==samplename,]$datafile - subsamples = sample2purity[sample2purity$sample==samplename,]$subsample - cellularity = sample2purity[sample2purity$sample==samplename,]$cellularity - if (file.exists(paste(datpath,datafiles, sep=""))) { - dataset = load.data(datpath,"",datafiles, cellularity=cellularity, Chromosome="chr", position="end", WT.count="WT.count", mut.count="mut.count", subclonal.CN="subclonal.CN", no.chrs.bearing.mut="no.chrs.bearing.mut", mutation.copy.number="mutation.copy.number", subclonal.fraction="subclonal.fraction", data_file_suffix="") - createQCDocument(dataset, samplename, subsamples, outpath, cellularity) - } - } +#' Plot cancer cell fraction histogram +#' +#' @param data A DPClust input table +#' @param samplename Name of the sample +#' @param outdir Directory where the figure is to be stored +#' @author sd11 +#' @export +plot_ccf_hist = function(data, samplename, outdir) { + p = createHistFacetPlot(meltFacetPlotData(data$subclonal.fraction, samplename), paste(samplename, "Fraction Tumour Cells"), "Fraction Tumour Cells", "Count", binwidth=0.05) + p = p + ggplot2::xlim(0,1.5) + createPng(p, file.path(outpath, paste(samplename, "_fractionOfCells.png", sep="")), width=1500, height=500) +} + +#' Plot allele frequency histogram +#' +#' @param data A DPClust input table +#' @param samplename Name of the sample +#' @param outdir Directory where the figure is to be stored +#' @author sd11 +#' @export +plot_vaf_hist = function(data, samplename, outdir) { + p = createHistFacetPlot(meltFacetPlotData(data$mutCount/(data$mutCount+data$WTCount), samplename), paste(samplename, "alleleFrequency"), "Allele Frequency", "Count", binwidth=0.01) + createPng(p, file.path(outpath, paste(samplename, "_alleleFrequency.png", sep="")), width=1500, height=500) } diff --git a/R/util.R b/R/util.R new file mode 100644 index 0000000..bd47882 --- /dev/null +++ b/R/util.R @@ -0,0 +1,72 @@ +#' Concatenate split files +#' +#' Convenience function to concatenate a series of files specified in a file of file names. +#' This function assumes all files have the same layout. +#' @param fofn A file of file names to be concatenated +#' @param inputdir Full path to where the input files are stored +#' @param outfile Full path to where the output should be written +#' @param haveHeader Boolean that specifies whether the input files have a header +#' @author sd11 +#' @export +concat_files = function(fofn, inputdir, outfile, haveHeader) { + + inputdir = paste(inputdir,"/", sep="") + list_of_files = read.table(fofn, stringsAsFactors=F, header=F)[,1] + + output = data.frame() + for (infile in list_of_files) { + infile = paste(inputdir, infile, sep="") + # Check if file is there and it contains data + if (file.exists(infile) & file.info(infile)$size != 0) { + dat = read.delim(infile, header=haveHeader, quote=NULL, stringsAsFactors=F) + output = rbind(output, dat) + } + } + + write.table(output, file=outfile, col.names=haveHeader, row.names=F, sep="\t", quote=F) +} + +#' Split a file per chromosome +#' +#' Convenience function to split an input file per chromosome. All it requires is that +#' the infile has as first column chromosome specification. The output files will be named +#' outdir/prefixCHROMNUMBERpostfix +#' @param infile The file to be split +#' @param prefix Prefix of the output file +#' @param postfix Postfix of the output file +#' @param outdir Directory where the output files are to be written +#' @param chrom_file A simple list of chromosomes to be considered +#' @author sd11 +#' @export +split_by_chrom = function(infile, prefix, postfix, outdir, chrom_file) { + outdir = paste(outdir, "/", sep="") + + # Check if there are lines in the file, otherwise it will crash this script + if (file.info(infile)$size == 0) { + print("No lines in loci file") + q(save="no") + } + + loci = read.delim(infile, stringsAsFactors=F, header=F) + + chroms = read.delim(chrom_file, stringsAsFactors=F, header=F) + + for (i in 1:nrow(chroms)) { + selection = loci[loci[,1]==chroms[i,1],] + chrom_id = chroms[i,2] + write.table(selection, file=paste(outdir, prefix, chrom_id, postfix, sep=""), quote=F, row.names=F, col.names=F, sep="\t") + } +} + +#' Calculate power to call subclones. +#' +#' This function calculates the +#' average number of reads per clonal chromosome copy. +#' @param purity The sample purity +#' @param ploidy The tumour ploidy +#' @param coverage The tumour sample coverage +#' @return The average reads per chromosome copy, a.k.a. power +#' @author sd11 +calc_power = function(purity, ploidy, coverage) { + return(round((purity) / (purity*ploidy + (1-purity)*2) * coverage, 3)) +} \ No newline at end of file diff --git a/inst/todo/adjust_vaf.R b/inst/todo/adjust_vaf.R new file mode 100644 index 0000000..cee900b --- /dev/null +++ b/inst/todo/adjust_vaf.R @@ -0,0 +1,136 @@ +args = commandArgs(T) +infile = args[1] +cellularity = as.numeric(args[2]) + +VAFcorrection <- function(r, N, C = 3){ + phat <- r / N + t0 <- pbinom(C+1,N,phat,lower.tail = F) + t1 <- sum(dbinom((C+1):N,N,phat)*((C+1):N )) + t2 <- sum(dbinom((C+1):N,N,phat)*( (C+1):N)^2 ) + p <- (t2-(C+1)*t1)/( (N-1)*t1-(C)*N*t0 ) + return(list(p=p,phat=phat)) +} + + +dat = read.table(infile, header=T, stringsAsFactors=F) +depth = (dat$mut.count+dat$WT.count) +new_vaf = unlist(lapply(1:nrow(dat), function(i) VAFcorrection(dat$mut.count[i], depth[i])$p)) +dat$mut.count = new_vaf * depth +dat$WT.count = depth - dat$mut.count +dat$mutation.copy.number = dpclust3p::mutationBurdenToMutationCopyNumber(dat$mut.count/ (dat$mut.count + dat$WT.count) , dat$subclonal.CN, cellularity) +dat$subclonal.fraction = dat$mutation.copy.number / dat$no.chrs.bearing.mut + +info = dat + + # convert MCN to subclonal fraction - tricky for amplified mutations + info$subclonal.fraction = info$mutation.copy.number + expected.burden.for.MCN = dpclust3p::mutationCopyNumberToMutationBurden(rep(1,length(info)),info$subclonal.CN,cellularity) + non.zero.indices = which(info$mut.count>0 & !is.na(expected.burden.for.MCN)) + #test for mutations in more than 1 copy + p.vals = sapply(1:length(non.zero.indices),function(v,e,i){ + prop.test(v$mut.count[i],v$mut.count[i] + v$WT.count[i],e[i],alternative="greater")$p.value + },v=info[non.zero.indices,], e=expected.burden.for.MCN[non.zero.indices]) + amplified.muts = non.zero.indices[p.vals<=0.05] + + info$no.chrs.bearing.mut = 1 + + #copy numbers of subclones can only differ by 1 or 0 (as assumed when calling subclones) + if(length(amplified.muts)>0){ + for(a in 1:length(amplified.muts)){ + max.CN2=0 + #use phasing info - if on 'deleted' (lower CN chromosome), use the minor copy number + if(info$phase[amplified.muts[a]]=="MUT_ON_DELETED"){ + print("mut on minor chromosome") + max.CN1 = info$nMin1[amplified.muts[a]] + frac1 = info$frac1[amplified.muts[a]] + frac2=0 + if(!is.na(info$nMin2[amplified.muts[a]])){ + #swap subclones, so that the one with the higher CN is first + if(info$nMin2[amplified.muts[a]]>max.CN1){ + max.CN2 = max.CN1 + max.CN1 = info$nMin2[amplified.muts[a]] + frac2 = frac1 + frac1 = info$frac2[amplified.muts[a]] + }else{ + max.CN2 = info$nMin2[amplified.muts[a]] + frac2 = info$frac2[amplified.muts[a]] + } + } + }else{ + max.CN1 = info$nMaj1[amplified.muts[a]] + frac1 = info$frac1[amplified.muts[a]] + frac2=0 + if(!is.na(info$nMaj2[amplified.muts[a]])){ + #swap subclones, so that the one with the higher CN is first + if(info$nMaj2[amplified.muts[a]]>max.CN1){ + max.CN2 = max.CN1 + max.CN1 = info$nMaj2[amplified.muts[a]] + frac2 = frac1 + frac1 = info$frac2[amplified.muts[a]] + }else{ + max.CN2 = info$nMaj2[amplified.muts[a]] + frac2 = info$frac2[amplified.muts[a]] + } + } + } + best.err = info$mutation.copy.number[amplified.muts[a]] - 1 + best.CN=1 + for(j in 1:max.CN1){ + for(k in (j-1):min(j,max.CN2)){ + potential.CN = j * frac1 + k * frac2 + err = abs(info$mutation.copy.number[amplified.muts[a]]/potential.CN-1) + if(err0){ + for(a in 1:length(subclonal.muts)){ + #if there are no subclonal CNVs, don't adjust subclonal fraction + if(is.na(info$frac2[subclonal.muts[a]])){next} + #assume subclonal muts are on one chromosome copy, therefore mutation copy number must be subclonal fraction of the higher CN subclone (i.e. lost in lower CN subclone) or 1 (i.e. present in both subclones) + if(info$nMaj1[subclonal.muts[a]]+info$nMin1[subclonal.muts[a]] > info$nMaj2[subclonal.muts[a]]+info$nMin2[subclonal.muts[a]]){ + possible.subclonal.fractions = c(info$frac1[subclonal.muts[a]],1) + }else{ + possible.subclonal.fractions = c(info$frac2[subclonal.muts[a]],1) + } + best.CN = possible.subclonal.fractions[which.min(abs(info$mutation.copy.number[subclonal.muts[a]]/possible.subclonal.fractions - 1))] + #extra test 200313 - check whether subclonal CN results in clonal mutation, otherwise subclonal CN doesn't explain subclonal MCN + if(best.CN != 1 & prop.test(info$mut.count[subclonal.muts[a]],info$mut.count[subclonal.muts[a]]+info$WT.count[subclonal.muts[a]],expected.burden.for.MCN[subclonal.muts[a]] * best.CN)$p.value > 0.05){ + info$subclonal.fraction[subclonal.muts[a]] = info$mutation.copy.number[subclonal.muts[a]] / best.CN + info$no.chrs.bearing.mut[subclonal.muts[a]] = best.CN + } + } + } + + possible.zero.muts = intersect((1:length(info))[-non.zero.indices],which(!is.na(info$nMin1))) + possible.zero.muts = c(possible.zero.muts,non.zero.indices[p.vals2>0.05]) + if(length(possible.zero.muts)>0){ + del.indices = which(info$nMin1[possible.zero.muts]==0 & !info$phase[possible.zero.muts]=="MUT_ON_RETAINED") + info$subclonal.fraction[possible.zero.muts[del.indices]] = NA + info$no.chrs.bearing.mut[possible.zero.muts[del.indices]] = 0 + } + +write.table(info, file=basename(infile), quote=F, sep="\t", row.names=F) + diff --git a/inst/todo/adjust_vaf.sh b/inst/todo/adjust_vaf.sh new file mode 100755 index 0000000..9ca251d --- /dev/null +++ b/inst/todo/adjust_vaf.sh @@ -0,0 +1,5 @@ +#!/bin/bash +infile=$1 +samplename=`basename ${infile} | cut -f 1 -d _` +cell=`grep ${samplename} ../dirichlet/conscna.txt | cut -f 4` +Rscript adjust_vaf.R ${infile} ${cell} diff --git a/inst/todo/phasing.R b/inst/todo/phasing.R new file mode 100644 index 0000000..b9826de --- /dev/null +++ b/inst/todo/phasing.R @@ -0,0 +1,274 @@ +# New phasing script to replace the old that used a separate perl script + + +# library(Rsamtools) +# library(VariantAnnotation) +# +# bam_file = "/nfs/users/nfs_c/cgppipe/pancancer/automated/donors/CLLE-ES/151/tumour/ffa976f0-aa60-4867-842e-361afa7d68ac.bam" +# vcf_file = "/nfs/users/nfs_c/cgppipe/pancancer/workspace/sd11/icgc_pancan_full/analysis_sanger_001/variants/ffa976f0-aa60-4867-842e-361afa7d68ac.svcp_1-0-4.20150202.somatic.snv_mnv.vcf.gz" + +mut_mut_phasing = function(loci_file, phased_file, bam_file, bai_file, max_distance) { + # TODO: this must be removed + chr.names = c(1:22,"X","Y") + + muts <- read.delim(loci_file, header=F, row.names=NULL, stringsAsFactors=F) + names(muts) = c("CHR","POSITION","WT","MT") + muts = muts[order(match(muts$CHR,chr.names),muts$POSITION),] + + # Pairwise comparison of all muts, only take those that are close to eachother + output <- data.frame(Chr = vector(mode="character",length=0), Pos1 = vector(mode="numeric",length=0), Ref1 = vector(mode="character",length=0), Var1 = vector(mode="character",length=0), Pos2 = vector(mode="numeric",length=0), Ref2 = vector(mode="character",length=0), Var2 = vector(mode="character",length=0)) + for(chr in chr.names){ + chr.muts = muts[muts$CHR==chr,] + no.muts = nrow(chr.muts) + for(i in 1:(no.muts-1)){ + dist = chr.muts$POSITION[(i+1):no.muts] - chr.muts$POSITION[i] + inds = which(dist <= max_distance) # 700 + if(length(inds)>0){ + output <- rbind(output,data.frame(Chr = chr, Pos1 = chr.muts$POSITION[i], Ref1 = chr.muts$WT[i], Var1 = chr.muts$MT[i], Pos2 = chr.muts$POSITION[i+inds], Ref2 = chr.muts$WT[i+inds], Var2 = chr.muts$MT[i+inds])) + } + } + } + + + if (nrow(output) > 0) { + + count.data = data.frame() + + for (i in 1:nrow(output)) { + chrom = output$Chr[i] + dat_pair = data.frame(chrom=rep(output$Chr[i],2), start=c(output$Pos1[i], output$Pos2[i]), end=c(output$Pos1[i], output$Pos2[i])) + dat_pair = makeGRangesFromDataFrame(dat_pair) + + wt_wt = paste(output$Ref1[i], output$Ref2[i], sep="_") + wt_mut = paste(output$Var1[i], output$Ref2[i], sep="_") + mut_wt = paste(output$Ref1[i], output$Var2[i], sep="_") + mut_mut = paste(output$Var1[i], output$Var2[i], sep="_") + + flag = scanBamFlag(isPaired=T, hasUnmappedMate=F, isDuplicate=F, isUnmappedQuery=F) #, isProperPair=T + param <- ScanBamParam(which=dat_pair[1,], what=scanBamWhat(), flag=flag) + bamfile <- BamFile(bam_file, asMates=TRUE) + bam <- scanBam(bamfile, param=param)[[1]] + + alleles = get_allele_combination_counts(bam, dat_pair) + + #' SNV to SNV + allele_pairs = table(factor(paste(alleles$snv1, alleles$snv2, sep="_"), levels=c(wt_wt, mut_mut, wt_mut, mut_wt))) + count.data = rbind(count.data, data.frame(output[i,], + Num_WT_WT=allele_pairs[[wt_wt]], + Num_Mut_Mut=allele_pairs[[mut_mut]], + Num_Mut_WT=allele_pairs[[mut_wt]], + Num_WT_Mut=allele_pairs[[wt_mut]])) + } + + # Categorise pairs of mutations + count.data$phasing = NA + for(h in 1:nrow(count.data)){ + counts = count.data[h,8:11] + print(counts) + if(counts[2]>0 & counts[3]+counts[4] == 0){ + count.data$phasing[h]="phased" + #}else if(counts[2]==0 & counts[3]+counts[4] > 0){ + #require BOTH WT-mut and mut-WT pairs + }else if(counts[2]==0 & counts[3]>0 & counts[4]>0){ + count.data$phasing[h]="anti-phased" + }else if(counts[2]>0 && counts[3]>0 && counts[4]==0){ + count.data$phasing[h]="clone-subclone" + }else if(counts[2]>0 && counts[3]==0 && counts[4]>0){ + count.data$phasing[h]="subclone-clone" + } + } + + write.table(count.data[!is.na(count.data$phasing),],phased_file,sep="\t",quote=F,row.names=F) + } +} + + +mut_cn_phasing = function(loci_file, phased_file, hap_file, bam_file, bai_file, outfile, max_distance) { + + if (file.info(loci_file)$size == 0) { + linked.muts = data.frame(matrix(rep(NA, 13), nrow=1)) + colnames(linked.muts) = c("Chr","Pos1","Ref1","Var1","Pos2","Ref2","Var2","AF","AFphased","Num_linked_to_A","Num_linked_to_C","Num_linked_to_G","Num_linked_to_T") + linked.muts = na.omit(linked.muts) + } else if(nrow(read.delim(loci_file, header=F, stringsAsFactors=F, fill=T))==0) { + linked.muts = data.frame(matrix(rep(NA, 13), nrow=1)) + colnames(linked.muts) = c("Chr","Pos1","Ref1","Var1","Pos2","Ref2","Var2","AF","AFphased","Num_linked_to_A","Num_linked_to_C","Num_linked_to_G","Num_linked_to_T") + linked.muts = na.omit(linked.muts) + } else { + # TODO: is this filtering required when just supplying loci files from a single chromosome? + chr.muts = read.delim(loci_file, header=F, stringsAsFactors=F, fill=T) + names(chr.muts) = c("CHR","POSITION","REF_BASE","MUT_BASE") + + # Match phased SNPs and their haplotypes together + phased = read.delim(phased_file, header=T, stringsAsFactors=F, quote="\"") + # Compatible with both BB setups that have row.names and those that don't + if (ncol(phased) == 6) { + colnames(phased) = c("SNP", "Chr","Pos", "AF", "AFphased", "AFsegmented") + } else { + colnames(phased) = c("Chr","Pos", "AF", "AFphased", "AFsegmented") + } + + # TODO: check that chromosomes are using the same names between loci and phased files + phased = phased[phased$Chr %in% chr.muts$CHR,] + + hap.info = read.delim(hap_file, sep=" ", header=F, row.names=NULL, stringsAsFactors=F) + # Compatible with both BB setups that have row.names and those that don't + if (ncol(hap.info) == 7) { + colnames(hap.info) = c("SNP","dbSNP","pos","ref","mut","ref_count","mut_count") + } else { + colnames(hap.info) = c("dbSNP","pos","ref","mut","ref_count","mut_count") + } + # get haplotypes that match phased heterozygous SNPs + hap.info = hap.info[match(phased$Pos,hap.info$pos),] + + # Synchronise dfs in case some SNPs are not in hap.info + selection = !is.na(hap.info$pos) + hap.info = hap.info[selection,] + phased = phased[selection,] + + #220212 + #phased$AF[hap.info$ref_count==1] = 1-phased$AF[hap.info$ref_count==1] + phased$Ref = hap.info$ref + phased$Var = hap.info$mut + + # Annotate the chr.muts df with the min abs distance to a phased SNP + chr.muts$dist = sapply(1:dim(chr.muts)[1], function(i,p,m) min(abs(p$Pos - m$POSITION[i])), p=phased, m=chr.muts) + chr.muts$snp.index = sapply(1:dim(chr.muts)[1], function(i,p,m) which.min(abs(p$Pos - m$POSITION[i])), p=phased, m=chr.muts) + # Use only those to a SNP + muts <- chr.muts[chr.muts$dist < max_distance,] #700 + snps <- phased[muts$snp.index,] + + # linked.muts = run_linkage_pull_snp(loci_file, bam_file, bai_file, muts$CHR, muts$POSITION, muts$REF_BASE, muts$MUT_BASE, snps$Pos, snps$Ref, snps$Var, snps$AF, snps$AFphased) + + output = data.frame(Chr=muts$CHR, + Pos1=muts$POSITION, + Ref1=muts$REF_BASE, + Var1=muts$MUT_BASE, + Pos2=snps$Pos, + Ref2=snps$Ref, + Var2=snps$Var, + AF=snps$AF, + AFphased=snps$AFphased) + + linked.muts = data.frame() + for (i in 1:nrow(output)) { + + chrom = output$Chr[i] + dat_pair = data.frame(chrom=rep(output$Chr[i],2), start=c(output$Pos1[i], output$Pos2[i]), end=c(output$Pos1[i], output$Pos2[i])) + dat_pair = makeGRangesFromDataFrame(dat_pair) + + flag = scanBamFlag(isPaired=T, hasUnmappedMate=F, isDuplicate=F, isUnmappedQuery=F) #, isProperPair=T + param <- ScanBamParam(which=dat_pair[1,], what=scanBamWhat(), flag=flag) + bamfile <- BamFile(bam_file, asMates=TRUE) + bam <- scanBam(bamfile, param=param)[[1]] + + count.data = get_allele_combination_counts(bam, dat_pair) + #' SNV to SNP + alleles_snv_ref = count.data[count.data$snv1==as.character(output$Ref1[i]),] + nucl_counts = sapply(c("A", "C", "G", "T"), function(nucl) { sum(alleles_snv_ref$snv2==nucl, na.rm=T) }) + linked.muts = rbind(linked.muts, data.frame(output[i,], + Num_linked_to_A=nucl_counts[["A"]], + Num_linked_to_C=nucl_counts[["C"]], + Num_linked_to_G=nucl_counts[["G"]], + Num_linked_to_T=nucl_counts[["T"]])) + } + + # Categorise where the mutation is with respect to the CN event + ACGT = 10:13 + names(ACGT) <- c("A", "C", "G", "T") + linked.muts$Parental <- rep(NA, dim(linked.muts)[1]) + if (nrow(linked.muts) > 0) { + for (i in 1:nrow(linked.muts)) { + + # Fetch allele frequency + af = linked.muts$AF[i] + # Get number of reads covering the ref mutation allele + ref_count = hap.info[hap.info$pos==linked.muts$Pos2[i],]$ref_count + # Get number of reads covering the alt mutation allele + alt_count = hap.info[hap.info$pos==linked.muts$Pos2[i],]$mut_count + # Number of reads covering SNP allele A, that also cover mutation alt + linked_to_A = linked.muts[i,ACGT[linked.muts$Ref2[i]]] + # Number of reads covering SNP allele B, that also cover mutation alt + linked_to_B = linked.muts[i,ACGT[linked.muts$Var2[i]]] + + if (af < 0.5 & alt_count==1 & linked_to_A > 0 & linked_to_B == 0) { + linked.muts$Parental[i] = "MUT_ON_DELETED" + } else if (af < 0.5 & alt_count==1 & linked_to_A == 0 & linked_to_B > 0) { + linked.muts$Parental[i] = "MUT_ON_RETAINED" + } else if (af > 0.5 & alt_count==1 & linked_to_A > 0 & linked_to_B == 0) { + linked.muts$Parental[i] = "MUT_ON_RETAINED" + } else if (af > 0.5 & alt_count==1 & linked_to_A == 0 & linked_to_B > 0) { + linked.muts$Parental[i] = "MUT_ON_DELETED" + } else if (af > 0.5 & ref_count==1 & linked_to_A > 0 & linked_to_B == 0) { + linked.muts$Parental[i] = "MUT_ON_DELETED" + } else if (af > 0.5 & ref_count==1 & linked_to_A == 0 & linked_to_B > 0) { + linked.muts$Parental[i] = "MUT_ON_RETAINED" + } else if (af < 0.5 & ref_count==1 & linked_to_A > 0 & linked_to_B == 0) { + linked.muts$Parental[i] = "MUT_ON_RETAINED" + } else if (af < 0.5 & ref_count==1 & linked_to_A == 0 & linked_to_B > 0) { + linked.muts$Parental[i] = "MUT_ON_DELETED" + } + } + } + } + + write.table(linked.muts,outfile, sep="\t", quote=F, row.names=F) +} + + + +get_allele_combination_counts = function(bam, dat_pair) { + alleles = data.frame() + + # alleles = lapply(seq(1, (length(bam$seq)), 2), function(j) { + for (j in seq(1, (length(bam$seq)), 2)) { + rel_pos_snv1 = NA + read_snv1 = NA + rel_pos_snv2 = NA + read_snv2 = NA + read_1 = NA + read_2 = NA + + rel_pos_read1_snv1 = (start(dat_pair[1])-bam$pos[j]) + 1 + rel_pos_read2_snv1 = (start(dat_pair[1])-bam$mpos[j]) + 1 + if (rel_pos_read1_snv1 < bam$qwidth[j] & rel_pos_read1_snv1 > 0) { + rel_pos_snv1 = rel_pos_read1_snv1 + read_snv1 = j + read_1 = "first" + } else if (rel_pos_read2_snv1 < bam$qwidth[j] & rel_pos_read2_snv1 > 0) { + rel_pos_snv1 = rel_pos_read2_snv1 + read_snv1 = j+1 + read_1 = "second" + } + + rel_pos_read1_snv2 = (start(dat_pair[2])-bam$pos[j]) + 1 + rel_pos_read2_snv2 = (start(dat_pair[2])-bam$mpos[j]) + 1 + if (rel_pos_read1_snv2 < bam$qwidth[j] & rel_pos_read1_snv2 > 0) { + rel_pos_snv2 = rel_pos_read1_snv2 + read_snv2 = j + read_2 = "first" + } else if (rel_pos_read2_snv2 < bam$qwidth[j] & rel_pos_read2_snv2 > 0) { + rel_pos_snv2 = rel_pos_read2_snv2 + read_snv2 = j+1 + read_2 = "second" + } + + if (!is.na(rel_pos_snv1)) { + base_snv1 = substr(as.character(bam$seq[read_snv1]), rel_pos_snv1, rel_pos_snv1) + } else { + base_snv1 = NA + } + + if (!is.na(rel_pos_snv2)) { + base_snv2 = substr(as.character(bam$seq[read_snv2]), rel_pos_snv2, rel_pos_snv2) + } else { + base_snv2 = NA + } + alleles = rbind(alleles, data.frame(snv1=base_snv1, snv2=base_snv2, read_1=read_1, read_2=read_2)) + # data.frame(snv1=base_snv1, snv2=base_snv2, read_1=read_1, read_2=read_2) + }#) + + # alleles = do.call(rbind, alleles) + alleles_m = na.omit(alleles) + return(alleles_m) +} + diff --git a/inst/todo/preprocessing_alternative_phasing.R b/inst/todo/preprocessing_alternative_phasing.R new file mode 100644 index 0000000..11c2c1b --- /dev/null +++ b/inst/todo/preprocessing_alternative_phasing.R @@ -0,0 +1,1239 @@ +ALLELECOUNTER = "alleleCounter" +LINKAGEPULL = "Linkage_pull.pl" + +#' Concatenate split files +#' +#' Convenience function to concatenate a series of files specified in a file of file names. +#' This function assumes all files have the same layout. +#' @param fofn A file of file names to be concatenated +#' @param inputdir Full path to where the input files are stored +#' @param outfile Full path to where the output should be written +#' @param haveHeader Boolean that specifies whether the input files have a header +#' @author sd11 +#' @export +concat_files = function(fofn, inputdir, outfile, haveHeader) { + + inputdir = paste(inputdir,"/", sep="") + list_of_files = read.table(fofn, stringsAsFactors=F, header=F)[,1] + + output = data.frame() + for (infile in list_of_files) { + infile = paste(inputdir, infile, sep="") + # Check if file is there and it contains data + if (file.exists(infile) & file.info(infile)$size != 0) { + dat = read.delim(infile, header=haveHeader, quote=NULL, stringsAsFactors=F) + output = rbind(output, dat) + } + } + + write.table(output, file=outfile, col.names=haveHeader, row.names=F, sep="\t", quote=F) +} + +#' Split a file per chromosome +#' +#' Convenience function to split an input file per chromosome. All it requires is that +#' the infile has as first column chromosome specification. The output files will be named +#' outdir/prefixCHROMNUMBERpostfix +#' @param infile The file to be split +#' @param prefix Prefix of the output file +#' @param postfix Postfix of the output file +#' @param outdir Directory where the output files are to be written +#' @param chrom_file A simple list of chromosomes to be considered +#' @author sd11 +#' @export +split_by_chrom = function(infile, prefix, postfix, outdir, chrom_file) { + outdir = paste(outdir, "/", sep="") + + # Check if there are lines in the file, otherwise it will crash this script + if (file.info(infile)$size == 0) { + print("No lines in loci file") + q(save="no") + } + + loci = read.delim(infile, stringsAsFactors=F, header=F) + + chroms = read.delim(chrom_file, stringsAsFactors=F, header=F) + + for (i in 1:nrow(chroms)) { + selection = loci[loci[,1]==chroms[i,1],] + chrom_id = chroms[i,2] + write.table(selection, file=paste(outdir, prefix, chrom_id, postfix, sep=""), quote=F, row.names=F, col.names=F, sep="\t") + } +} + +############################################ +# VCF 2 LOCI +############################################ +#' Parse genome index +#' +#' Convenience function that parses a reference genome index as generated +#' by samtools index +#' @param fai_file The index +#' @return A data frame with columns "chromosome", "length", "offset", "fasta_line_length", "line_blen" +#' @author sd11 +#' @export +parseFai = function(fai_file) { + fai = read.table(fai_file, header=F, stringsAsFactors=F) + colnames(fai) = c("chromosome", "length", "offset", "fasta_line_length", "line_blen") + return(fai) +} + +#' Parse chromosomes to ignore file +#' +#' Convenience function that parses an ignore file. This file +#' is expected to have a single column with just chromosome names +#' @param ignore_file The file specifying to be ignored chromosomes +#' @return A data frame with a single column named "chromosome" +#' @author sd11 +#' @export +parseIgnore = function(ignore_file) { + ign = read.table(ignore_file, header=F, stringsAsFactors=F) + colnames(ign) = c("chromosome") + return(ign) +} + +#' Transform vcf to loci file +#' +#' Function that dumps the loci of snvs from a series of vcf files into a single loci file +#' @param vcf_files A vector of vcf files to be considered +#' @param fai_file Reference genome index +#' @param ign_file A file with chromosomes to be excluded from consideration +#' @param outfile Where to store the output +#' @author sd11 +#' @export +vcf2loci = function(vcf_files, fai_file, ign_file, outfile) { + fai = parseFai(fai_file) + ign = parseIgnore(ign_file) + allowed_chroms = which(!(fai$chromosome %in% ign$chromosome)) + + # Run through each supplied vcf file, collect the loci from each + combined.loci = data.frame() + for (vcf_file in vcf_files) { + vcf.cols = ncol(read.delim(vcf_file, comment.char="#", header=F, stringsAsFactor=F, nrows=1)) + vcf.cols.default = 10 # vcf file standard contains 10 columns + vcf.colClasses = c(NA, NA, "NULL", NA, NA, rep("NULL", 5+(vcf.cols-vcf.cols.default))) + vcf.loci = read.delim(vcf_file, comment.char="#", header=F, stringsAsFactor=F, colClasses=vcf.colClasses) + colnames(vcf.loci) = c("chromosome", "pos", "ref","alt") + vcf.loci.sel = subset(vcf.loci, chromosome %in% fai$chromosome[allowed_chroms]) + combined.loci = rbind(combined.loci, vcf.loci.sel) + } + # Remove duplicate entries + combined.loci = unique(combined.loci) + write.table(combined.loci, col.names=F, quote=F, row.names=F, file=outfile, sep="\t") +} + +############################################ +# Mutation signatures +############################################ + +#' Obtain tri-nucleotide context. It reads in the supplied loci file and querries +#' the provided reference genome fasta for the context. Finally it writes out the +#' loci with annotated context. +#' @param loci_file A 4 column loci file with chrom, position, reference allele and alt allele +#' @param outfile Where to write the output +#' @param ref_genome Full path to a reference genome Fasta file +#' @author sd11 +#' @export +getTrinucleotideContext = function(loci_file, outfile, ref_genome) { + loci = read.table(loci_file, sep='\t', header=F, stringsAsFactors=F) + loci.g = GenomicRanges::GRanges(seqnames=loci[,1], ranges=IRanges::IRanges(loci[,2], loci[,2])) + r = Rsamtools::scanFa(file=ref_genome, resize(GenomicRanges::granges(loci.g), 3, fix="center")) + write.table(cbind(loci, as.data.frame(r)), file=outfile, row.names=F, col.names=F, quote=F, sep="\t") +} + +# #' Checks each tri-nucleotide context whether its CAG or CTG +# #' @param vector_of_trinucleotides A vector containing the base to check and its immediate neighbors +# #' @author sd11 +# isDeaminase = function(vector_of_trinucleotides) { +# return(grepl("(CAG)|(CTG)|(GAC)|(GTC)", vector_of_trinucleotides)) +# } + +#' Filter a with tri-nucleotide context annotated list of loci for a particular signature +#' supplied as a regex like so: (CAG)|(CTG)|(GAC)|(GTC). +#' @param signature_anno_loci_file Filepath to a with tri-nucleotide context annotated loci +#' @param signature_regex A regex that captures the signature to keep. +#' @param outfile Filepath to where to store the output. +#' @param trinucleotide_column Integer representing the column within the input file that contains the context +#' @param alt_alleles An optional vector with reference alleles to allow. +#' @param alt_allele_column The column in the annotated loci file that contains the reference base. +#' @author sd11 +#' @export +filterForSignature = function(signature_anno_loci_file, signature_regex, outfile, trinucleotide_column=5, alt_alleles=NA, alt_allele_column=4) { + signature_anno_loci = read.table(signature_anno_loci_file, sep='\t', header=F, stringsAsFactors=F) + signature_anno_loci_filt = signature_anno_loci[grepl(signature_regex, signature_anno_loci[,trinucleotide_column]), ] + + if (!is.na(alt_alleles)) { + regex_split = gsub("(", "", signature_regex, fixed=T) + regex_split = gsub(")", "", regex_split, fixed=T) + regex_split = unlist(strsplit(regex_split, "|", fixed=T)) + selection = rep(F, nrow(signature_anno_loci_filt)) + + # Go through all signatures and their specific reference context + for (i in 1:length(regex_split)) { + alt_i = alt_alleles[i] + sign_i = regex_split[i] + # Select only those mutations that have this particular context and reference + selection[signature_anno_loci_filt[,trinucleotide_column]==sign_i & signature_anno_loci_filt[,alt_allele_column]==alt_i] = T + } + signature_anno_loci_filt = signature_anno_loci_filt[selection,] + } + write.table(signature_anno_loci_filt, file=outfile, row.names=F, col.names=F, quote=F, sep="\t") +} + +#' Convenience function that filters a with tri-nucleotide context annotated list of loci for +#' cytosine deaminase signature, or C>T at CpG. +#' @param signature_anno_loci_file Filepath to a with tri-nucleotide context annotated loci +#' @param outfile Filepath to where to store the output. +#' @param ref_genome Full path to an indexed reference genome fasta file +#' @param trinucleotide_column Integer representing the column within the input file that contains the context +#' @param alt_allele_column The column in the annotated loci file that contains the reference base. +#' @author sd11 +#' @export +filterForDeaminase = function(loci_file, outfile, ref_genome, trinucleotide_column=5, alt_allele_column=4) { + signature_anno_loci_file = gsub(".txt", "_signature_anno.txt", loci_file) + getTrinucleotideContext(loci_file, signature_anno_loci_file, ref_genome) + filterForSignature(signature_anno_loci_file=signature_anno_loci_file, + signature_regex="(CCG)|(GCC)|(CGG)|(GGC)", + outfile=outfile, + trinucleotide_column=trinucleotide_column, + alt_alleles=c("T", "T", "A", "A"), + alt_allele_column=alt_allele_column) +} + + +############################################ +# Allele counting +############################################ +#' Run alleleCount +#' +#' Count the alleles for specified locations in the loci file. Expects alleleCount binary in $PATH +#' @param locifile A file with at least chromsome and position columns of the locations to be counted +#' @param bam A bam file +#' @param outfile Where to store the output +#' @param min_baq The minimum base quality required for a read to be counted +#' @param min_maq The minimum mapping quality required for a read to be counted +#' @author sd11 +#' @export +alleleCount = function(locifile, bam, outfile, min_baq=20, min_maq=35) { + cmd = paste(ALLELECOUNTER, + "-b", bam, + "-o", outfile, + "-l", locifile, + "-m", min_baq, + "-q", min_maq, sep=" ") + system(cmd, wait=T) +} + +#' Dump allele counts from vcf - Sanger ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.Sanger = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="sanger", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - DKFZ ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.DKFZ = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="dkfz", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - Broad ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.Broad = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="broad", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - MuSE ICGC pancancer pipeline +#' +#' Dump allele counts stored in the sample columns of the VCF file. Output will go into a file +#' supplied as tumour_outfile and optionally normal_outfile. It will be a fully formatted +#' allele counts file as returned by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.Muse = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="muse", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - ICGC pancancer consensus pipeline +#' +#' Dump allele counts stored in the info column of the VCF file. Output will go into a file +#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned +#' by alleleCounter. There are no counts for the matched normal. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @param samplename Optional parameter specifying the samplename to be used for matching the right column in the VCF +#' @author sd11 +#' @export +dumpCounts.ICGCconsensus = function(vcf_infile, tumour_outfile, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="icgc_consensus", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from vcf - Mutect +#' +#' Dump allele counts stored in the info column of the VCF file. Output will go into a file +#' supplied as tumour_outfile. It will be a fully formatted allele counts file as returned +#' by alleleCounter. +#' @param vcf_infile The vcf file to read in +#' @param tumour_outfile File to save the tumour counts to +#' @param samplename Parameter specifying the samplename to be used for matching the right column in the VCF +#' @param normal_outfile Optional parameter specifying where the normal output should go +#' @param refence_genome Optional parameter specifying the reference genome build used +#' @author sd11 +#' @export +dumpCounts.mutect = function(vcf_infile, tumour_outfile, samplename, normal_outfile=NA, refence_genome="hg19") { + dumpCountsFromVcf(vcf_infile, tumour_outfile, centre="mutect", normal_outfile=normal_outfile, refence_genome=refence_genome, samplename=samplename) +} + +#' Dump allele counts from VCF +#' +#' This function implements all the steps required for dumping counts from VCF +#' as supplied by the ICGC pancancer pipelines. +#' @noRd +dumpCountsFromVcf = function(vcf_infile, tumour_outfile, centre, normal_outfile=NA, refence_genome="hg19", samplename=NA) { + # Helper function for writing the output + write.output = function(output, output_file) { + write.table(output, file=output_file, col.names=T, quote=F, row.names=F, sep="\t") + } + + # Read in the vcf and dump the tumour counts in the right format + v = VariantAnnotation::readVcf(vcf_infile, refence_genome) + tumour = getCountsTumour(v, centre=centre, samplename=samplename) + tumour = formatOutput(tumour, v) + write.output(tumour, tumour_outfile) + + # Optionally dump the normal counts in the right format + if (!is.na(normal_outfile)) { + normal = getCountsNormal(v, centre=centre, samplename=samplename) + normal = formatOutput(normal, v) + write.output(normal, normal_outfile) + } +} + +#' Format a 4 column counts table into the alleleCounter format. This function assumes A, C, G, T format. +#' @noRd +formatOutput = function(counts_table, v) { + output = data.frame(as.character(seqnames(v)), start(ranges(v)), counts_table, rowSums(counts_table)) + colnames(output) = c("#CHR","POS","Count_A","Count_C","Count_G","Count_T","Good_depth") + return(output) +} + +#' Dump allele counts from vcf for normal +#' +#' Returns an allele counts table for the normal sample +#' @param v The vcf file +#' @param centre The sequencing centre of which pipeline the vcf file originates +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getCountsNormal = function(v, centre="sanger", samplename=NA) { + if (centre=="sanger") { + return(getAlleleCounts.Sanger(v, 1)) + } else if(centre=="dkfz") { + sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "CONTROL") + return(getAlleleCounts.DKFZ(v, sample_col)) + } else if (centre=="muse") { + # Assuming the tumour name is provided + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) != samplename) + return (getAlleleCounts.MuSE(v, sample_col)) + } else if (centre=="broad") { + print("The Broad ICGC pipeline does not report allele counts for the matched normal") + q(save="no", status=1) + } else if (centre=="icgc_consensus") { + print("The ICGC consensus pipeline does not report allele counts for the matched normal") + q(save="no", status=1) + } else if (centre=="mutect") { + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) + return(getAlleleCounts.mutect(v, sample_col)) + } else { + print(paste("Supplied centre not supported:", centre)) + q(save="no", status=1) + } +} + +#' Dump allele counts from vcf for tumour +#' +#' Returns an allele counts table for the tumour sample +#' @param v The vcf file +#' @param centre The sequencing centre of which pipeline the vcf file originates +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getCountsTumour = function(v, centre="sanger", samplename=NA) { + if (centre=="sanger") { + return(getAlleleCounts.Sanger(v, 2)) + } else if (centre=="dkfz") { + sample_col = which(colnames(VariantAnnotation::geno(v)$DP4) == "TUMOR") + return(getAlleleCounts.DKFZ(v, sample_col)) + } else if (centre=="muse") { + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) + return (getAlleleCounts.MuSE(v, sample_col)) + } else if (centre=="broad") { + return(getAlleleCounts.Broad(v, 1)) + } else if (centre=="icgc_consensus") { + return(getAlleleCounts.ICGC_consensus(v)) + } else if (centre=="mutect") { + sample_col = which(colnames(VariantAnnotation::geno(v)$AD) == samplename) + return(getAlleleCounts.mutect(v, sample_col)) + } else { + print(paste("Supplied centre not supported:", centre)) + q(save="no", status=1) + } +} + +#' Dump allele counts from Sanger pipeline vcf +#' +#' Helper function that dumps the allele counts from a Sanger pipeline VCF file +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.Sanger = function(v, sample_col) { + return(cbind(VariantAnnotation::geno(v)$FAZ[,sample_col]+VariantAnnotation::geno(v)$RAZ[,sample_col], + VariantAnnotation::geno(v)$FCZ[,sample_col]+VariantAnnotation::geno(v)$RCZ[,sample_col], + VariantAnnotation::geno(v)$FGZ[,sample_col]+VariantAnnotation::geno(v)$RGZ[,sample_col], + VariantAnnotation::geno(v)$FTZ[,sample_col]+VariantAnnotation::geno(v)$RTZ[,sample_col])) +} + +#' Dump allele counts from the DKFZ pipeline +#' +#' Helper function that takes a sample column and fetches allele counts. As the DKFZ pipeline does not +#' provide counts for each base, but just the alt and reference, we will provide just those and the +#' other bases with 0s. +#' +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.DKFZ = function(v, sample_col) { + # Fetch counts for both forward and reverse ref/alt + counts = VariantAnnotation::geno(v)$DP4[,sample_col,] + counts.ref = counts[,1] + counts[,2] # ref forward/reverse + counts.alt = counts[,3] + counts[,4] # alt forward/reverse + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts from the MuSE pipeline +#' +#' Helper function that takes a sample column and fetches allele counts. As the MuSE pipeline does not +#' provide counts for each base, but just the alt and reference, we will provide just those and the +#' other bases with 0s. +#' +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +#' Note2: This function assumes there are only two columns with read counts. The format allows for more +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.MuSE = function(v, sample_col) { + if (length(colnames(VariantAnnotation::geno(v)$AD)) > 2) { + print("In getAlleleCounts.MuSE: Assuming 2 columns with read counts, but found more. This is not supported") + q(save="no", status=1) + } + + # An SNV can be represented by more than 1 alt alleles, here we pick the alt allele with the highest read count + num.snvs = nrow(VariantAnnotation::geno(v)$AD) + counts = array(NA, c(num.snvs, 2)) + allele.alt = array(NA, num.snvs) + for (i in 1:num.snvs) { + snv.counts = unlist(VariantAnnotation::geno(v)$AD[i,sample_col]) + counts[i,1] = snv.counts[1] # The reference is the first base for which read counts are mentioned + select_base = which.max(snv.counts[2:length(snv.counts)]) + allele.alt[i] = as.character(VariantAnnotation::alt(v)[[i]][select_base]) + select_base = select_base+1 # The reference is the first base for which read counts are mentioned + counts[i,2] = snv.counts[select_base] + } + + allele.ref = as.character(VariantAnnotation::ref(v)) + + output = construct_allelecounter_table(counts[i,1], counts[i,2], allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts from the Broad pipeline +#' +#' Helper function that takes a sample column and fetches allele counts. As the Broad pipeline does not +#' provide counts for each base, but just the alt and reference, we will provide just those and the +#' other bases with 0s. +#' +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +getAlleleCounts.Broad = function(v, sample_col) { + count.ref = as.numeric(unlist(VariantAnnotation::geno(v)$ref_count)) + count.alt = as.numeric(unlist(VariantAnnotation::geno(v)$alt_count)) + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts in ICGC consensus SNV format +#' +#' This function fetches allele counts from the info field in the VCF file. +#' +#' @param v The vcf file +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +getAlleleCounts.ICGC_consensus = function(v) { + count.alt = info(v)$t_alt_count + count.ref = info(v)$t_ref_count + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Dump allele counts in Mutect format +#' +#' This function fetches allele counts from the info field in the VCF file. +#' +#' @param v The vcf file +#' @param sample_col The column in which the counts are. If it's the first sample mentioned in the vcf this would be sample_col 1 +#' @return An array with 4 columns: Counts for A, C, G, T +#' @author sd11 +#' @noRd +#' Note: If there are multiple ALT alleles this function will only take the first mentioned! +getAlleleCounts.mutect = function(v, sample_col) { + counts = do.call(rbind, geno(v)$AD[,sample_col]) + count.ref = counts[,1] + count.alt = counts[,2] + allele.ref = as.character(VariantAnnotation::ref(v)) + allele.alt = unlist(lapply(VariantAnnotation::alt(v), function(x) { as.character(x[[1]]) })) + output = construct_allelecounter_table(count.ref, count.alt, allele.ref, allele.alt) + return(output) +} + +#' Function that constructs a table in the format of the allele counter +#' @param count.ref Number of reads supporting the reference allele +#' @param count.alt Number of reads supporting the variant allele +#' @param allele.ref The reference allele +#' @param allele.alt The variant allele +#' @return A data.frame consisting of four columns: Reads reporting A, C, G and T +#' @author sd11 +#' @noRd +construct_allelecounter_table = function(count.ref, count.alt, allele.ref, allele.alt) { + output = array(0, c(length(allele.ref), 4)) + nucleotides = c("A", "C", "G", "T") + # Propagate the alt allele counts + nucleo.index = match(allele.alt, nucleotides) + for (i in 1:nrow(output)) { + output[i,nucleo.index[i]] = count.alt[i] + } + + # Propagate the reference allele counts + nucleo.index = match(allele.ref, nucleotides) + for (i in 1:nrow(output)) { + output[i,nucleo.index[i]] = count.ref[i] + } + return(output) +} + +############################################ +# MUT 2 MUT phasing +############################################ +#' Run linkage pull on SNVs vs SNVs +#' @noRd +run_linkage_pull_mut = function(output, loci_file, bam_file, bai_file) { + # + # Runs Linkage_pull + # + nearbyMuts_file = paste(loci_file, "_nearbyMuts.txt", sep="") + write.table(output, nearbyMuts_file, sep="\t", quote=F, col.names=F, row.names=F) + + zoomed_phase_file = paste(loci_file, "_zoomed_phase_info.txt", sep="") + cmd=paste(LINKAGEPULL," ",nearbyMuts_file," ",bam_file," ",bai_file," ",zoomed_phase_file," Mut",sep="") + print(paste("command=",cmd,sep="")) + system(cmd,wait=T) + + count.data = read.delim(zoomed_phase_file,sep="\t",header=T, stringsAsFactors=F) + + return(count.data) +} + +#' #' Phase mutation to mutation +#' #' +#' #' Run mutation to mutation phasing. This function requires the Linkage_pull.pl script in $PATH. +#' #' @param loci_file A list of loci +#' #' @param phased_file File to save the output +#' #' @param bam_file Full path to the bam file +#' #' @param bai_file Full path to the bai file +#' #' @param max_distance The max distance of a mutation and SNP can be apart to be considered for phasing +#' #' @author sd11, dw9 +#' #' @export +#' mut_mut_phasing = function(loci_file, phased_file, bam_file, bai_file, max_distance) { +#' # Check if there are lines in the file, otherwise it will crash this script +#' if (file.info(loci_file)$size == 0) { +#' print("No lines in loci file") +#' q(save="no") +#' } +#' +#' # TODO: this must be removed +#' chr.names = c(1:22,"X","Y") +#' +#' muts <- read.delim(loci_file, header=F, row.names=NULL, stringsAsFactors=F) +#' names(muts) = c("CHR","POSITION","WT","MT") +#' muts = muts[order(match(muts$CHR,chr.names),muts$POSITION),] +#' +#' # # TODO: Does this work with X and Y ?? +#' # if (!is.null(chrom)) { +#' # muts = muts[muts$CHR==chrom,] +#' # } +#' +#' # Pairwise comparison of all muts, only take those that are close to eachother +#' output <- data.frame(Chr = vector(mode="character",length=0), Pos1 = vector(mode="numeric",length=0), Ref1 = vector(mode="character",length=0), Var1 = vector(mode="character",length=0), Pos2 = vector(mode="numeric",length=0), Ref2 = vector(mode="character",length=0), Var2 = vector(mode="character",length=0)) +#' for(chr in chr.names){ +#' chr.muts = muts[muts$CHR==chr,] +#' no.muts = nrow(chr.muts) +#' for(i in 1:(no.muts-1)){ +#' dist = chr.muts$POSITION[(i+1):no.muts] - chr.muts$POSITION[i] +#' inds = which(dist <= max_distance) # 700 +#' if(length(inds)>0){ +#' output <- rbind(output,data.frame(Chr = chr, Pos1 = chr.muts$POSITION[i], Ref1 = chr.muts$WT[i], Var1 = chr.muts$MT[i], Pos2 = chr.muts$POSITION[i+inds], Ref2 = chr.muts$WT[i+inds], Var2 = chr.muts$MT[i+inds])) +#' } +#' } +#' } +#' +#' if (nrow(output) > 0) { +#' # Run linkage pull on the chromosome locations mentioned in the data.frame output +#' count.data = run_linkage_pull_mut(output, loci_file, bam_file, bai_file) +#' +#' # Categorise pairs of mutations +#' count.data$phasing = NA +#' for(h in 1:nrow(count.data)){ +#' counts = count.data[h,8:11] +#' print(counts) +#' if(counts[2]>0 & counts[3]+counts[4] == 0){ +#' count.data$phasing[h]="phased" +#' #}else if(counts[2]==0 & counts[3]+counts[4] > 0){ +#' #require BOTH WT-mut and mut-WT pairs +#' }else if(counts[2]==0 & counts[3]>0 & counts[4]>0){ +#' count.data$phasing[h]="anti-phased" +#' }else if(counts[2]>0 && counts[3]>0 && counts[4]==0){ +#' count.data$phasing[h]="clone-subclone" +#' }else if(counts[2]>0 && counts[3]==0 && counts[4]>0){ +#' count.data$phasing[h]="subclone-clone" +#' } +#' } +#' +#' write.table(count.data[!is.na(count.data$phasing),],phased_file,sep="\t",quote=F,row.names=F) +#' } +#' } + +############################################ +# MUT 2 CN phasing +############################################ +#' Run linkage pull on SNVs vs SNPs +#' @noRd +run_linkage_pull_snp = function(loci_file, bam_file, bai_file, chr, pos1, ref1, var1, pos2, ref2, var2, af, af_phased) { + # + # Runs the Linkage_pull.pl script with the given columns as its input. + # Returns a dataframe with the given columns, plus the linkage information that was pulled from the BAM file + # + + output = data.frame(Chr=chr, Pos1=pos1, Ref1=ref1, Var1=var1, Pos2=pos2, Ref2=ref2, Var2=var2, AF=af, AFphased=af_phased) + + linkedFile = paste(loci_file, "_muts_linkedSNPs.txt",sep="") + write.table(output[,1:7], linkedFile, sep="\t", quote=F, col.names=F, row.names=F) + + outfile = paste(loci_file, "_zoomed_mutcn_phase.txt", sep="") + cmd=paste(LINKAGEPULL," ",linkedFile," ",bam_file," ",bai_file," ",outfile," SNP",sep="") + print(paste("command=",cmd,sep="")) + system(cmd,wait=T) + + input <- read.delim(outfile, header=T, sep="\t", stringsAsFactors=F) + linked.muts <- cbind(output, input[,8:11]) + + return(linked.muts) +} + +#' #' Phase mutation to SNP/copy number +#' #' +#' #' Run mutation to copy number phasing. This function requires the Linkage_pull.pl script in $PATH. +#' #' Note: This function should either be run separately per chromosome and then combined with \code{\link{concat_files}} +#' #' or on all chromsomes in one go, but then the _allHaplotypeInfo.txt Battenberg files need to be concatenated first. +#' #' @param loci_file A list of loci +#' #' @param phased_file The .BAFsegmented.txt output file from Battenberg +#' #' @param hap_file Path to the _allHaplotypeInfo.txt Battenberg output file to be used +#' #' @param bam_file Full path to the bam file +#' #' @param bai_file Full path to the bai file +#' #' @param outfile File to save the output +#' #' @param max_distance The max distance of a mutation and SNP can be apart to be considered for phasing +#' #' @author sd11, dw9 +#' #' @export +#' mut_cn_phasing = function(loci_file, phased_file, hap_file, bam_file, bai_file, outfile, max_distance) { +#' +#' if (file.info(loci_file)$size == 0) { +#' linked.muts = data.frame(matrix(rep(NA, 13), nrow=1)) +#' colnames(linked.muts) = c("Chr","Pos1","Ref1","Var1","Pos2","Ref2","Var2","AF","AFphased","Num_linked_to_A","Num_linked_to_C","Num_linked_to_G","Num_linked_to_T") +#' linked.muts = na.omit(linked.muts) +#' } else if(nrow(read.delim(loci_file, header=F, stringsAsFactors=F, fill=T))==0) { +#' linked.muts = data.frame(matrix(rep(NA, 13), nrow=1)) +#' colnames(linked.muts) = c("Chr","Pos1","Ref1","Var1","Pos2","Ref2","Var2","AF","AFphased","Num_linked_to_A","Num_linked_to_C","Num_linked_to_G","Num_linked_to_T") +#' linked.muts = na.omit(linked.muts) +#' } else { +#' # TODO: is this filtering required when just supplying loci files from a single chromosome? +#' chr.muts = read.delim(loci_file, header=F, stringsAsFactors=F, fill=T) +#' names(chr.muts) = c("CHR","POSITION","REF_BASE","MUT_BASE") +#' +#' # Match phased SNPs and their haplotypes together +#' phased = read.delim(phased_file, header=T, stringsAsFactors=F, quote="\"") +#' # Compatible with both BB setups that have row.names and those that don't +#' if (ncol(phased) == 6) { +#' colnames(phased) = c("SNP", "Chr","Pos", "AF", "AFphased", "AFsegmented") +#' } else { +#' colnames(phased) = c("Chr","Pos", "AF", "AFphased", "AFsegmented") +#' } +#' +#' # TODO: check that chromosomes are using the same names between loci and phased files +#' phased = phased[phased$Chr %in% chr.muts$CHR,] +#' +#' hap.info = read.delim(hap_file, sep=" ", header=F, row.names=NULL, stringsAsFactors=F) +#' # Compatible with both BB setups that have row.names and those that don't +#' if (ncol(hap.info) == 7) { +#' colnames(hap.info) = c("SNP","dbSNP","pos","ref","mut","ref_count","mut_count") +#' } else { +#' colnames(hap.info) = c("dbSNP","pos","ref","mut","ref_count","mut_count") +#' } +#' # get haplotypes that match phased heterozygous SNPs +#' hap.info = hap.info[match(phased$Pos,hap.info$pos),] +#' +#' # Synchronise dfs in case some SNPs are not in hap.info +#' selection = !is.na(hap.info$pos) +#' hap.info = hap.info[selection,] +#' phased = phased[selection,] +#' +#' #220212 +#' #phased$AF[hap.info$ref_count==1] = 1-phased$AF[hap.info$ref_count==1] +#' phased$Ref = hap.info$ref +#' phased$Var = hap.info$mut +#' +#' # Annotate the chr.muts df with the min abs distance to a phased SNP +#' chr.muts$dist = sapply(1:dim(chr.muts)[1], function(i,p,m) min(abs(p$Pos - m$POSITION[i])), p=phased, m=chr.muts) +#' chr.muts$snp.index = sapply(1:dim(chr.muts)[1], function(i,p,m) which.min(abs(p$Pos - m$POSITION[i])), p=phased, m=chr.muts) +#' # Use only those to a SNP +#' muts <- chr.muts[chr.muts$dist < max_distance,] #700 +#' snps <- phased[muts$snp.index,] +#' +#' linked.muts = run_linkage_pull_snp(loci_file, bam_file, bai_file, muts$CHR, muts$POSITION, muts$REF_BASE, muts$MUT_BASE, snps$Pos, snps$Ref, snps$Var, snps$AF, snps$AFphased) +#' +#' # Categorise where the mutation is with respect to the CN event +#' ACGT = 10:13 +#' names(ACGT) <- c("A", "C", "G", "T") +#' linked.muts$Parental <- rep(NA, dim(linked.muts)[1]) +#' if (nrow(linked.muts) > 0) { +#' for (i in 1:nrow(linked.muts)) { +#' +#' # Fetch allele frequency +#' af = linked.muts$AF[i] +#' # Get number of reads covering the ref mutation allele +#' ref_count = hap.info[hap.info$pos==linked.muts$Pos2[i],]$ref_count +#' # Get number of reads covering the alt mutation allele +#' alt_count = hap.info[hap.info$pos==linked.muts$Pos2[i],]$mut_count +#' # Number of reads covering SNP allele A, that also cover mutation alt +#' linked_to_A = linked.muts[i,ACGT[linked.muts$Ref2[i]]] +#' # Number of reads covering SNP allele B, that also cover mutation alt +#' linked_to_B = linked.muts[i,ACGT[linked.muts$Var2[i]]] +#' +#' if (af < 0.5 & alt_count==1 & linked_to_A > 0 & linked_to_B == 0) { +#' linked.muts$Parental[i] = "MUT_ON_DELETED" +#' } else if (af < 0.5 & alt_count==1 & linked_to_A == 0 & linked_to_B > 0) { +#' linked.muts$Parental[i] = "MUT_ON_RETAINED" +#' } else if (af > 0.5 & alt_count==1 & linked_to_A > 0 & linked_to_B == 0) { +#' linked.muts$Parental[i] = "MUT_ON_RETAINED" +#' } else if (af > 0.5 & alt_count==1 & linked_to_A == 0 & linked_to_B > 0) { +#' linked.muts$Parental[i] = "MUT_ON_DELETED" +#' } else if (af > 0.5 & ref_count==1 & linked_to_A > 0 & linked_to_B == 0) { +#' linked.muts$Parental[i] = "MUT_ON_DELETED" +#' } else if (af > 0.5 & ref_count==1 & linked_to_A == 0 & linked_to_B > 0) { +#' linked.muts$Parental[i] = "MUT_ON_RETAINED" +#' } else if (af < 0.5 & ref_count==1 & linked_to_A > 0 & linked_to_B == 0) { +#' linked.muts$Parental[i] = "MUT_ON_RETAINED" +#' } else if (af < 0.5 & ref_count==1 & linked_to_A == 0 & linked_to_B > 0) { +#' linked.muts$Parental[i] = "MUT_ON_DELETED" +#' } +#' } +#' } +#' } +#' +#' write.table(linked.muts,outfile, sep="\t", quote=F, row.names=F) +#' } + +############################################ +# Copy number convert scripts +############################################ + +#' Transform ASCAT output into Battenberg +#' +#' This function takes the ascat segments, acf and ploidy files to create +#' a minimum Battenberg subclones and rho_and_psi file for use in pre-processing. +#' These files only contain the essentials required by this package. +#' @param outfile.prefix String prefix for the output files. Internally _subclones.txt and _rho_and_psi.txt will be added. +#' @param segments.file String pointing to a segments.txt ASCAT output file +#' @param acf.file String pointing to a acf.txt ASCAT output file +#' @param ploidy.file String pointing to a ploidy.txt ASCAT output file +#' @author sd11 +#' @export +ascatToBattenberg = function(outfile.prefix, segments.file, acf.file, ploidy.file) { + # Construct a minimum Battenberg file with the copy number segments + d = read.table(segments.file, header=T, stringsAsFactors=F) + subclones = d[,2:6] + colnames(subclones)[4] = "nMaj1_A" + colnames(subclones)[5] = "nMin1_A" + subclones$frac1_A = 1 + subclones$nMaj2_A = NA + subclones$nMin2_A = NA + subclones$frac2_A = NA + subclones$SDfrac_A = NA + write.table(subclones, paste(outfile.prefix, "_subclones.txt", sep=""), quote=F, sep="\t") + + # Now construct a minimum rho/psi file + cellularity = read.table(acf.file, header=T)[1,1] + ploidy = read.table(ploidy.file, header=T)[1,1] + + purity_ploidy = array(NA, c(3,5)) + colnames(purity_ploidy) = c("rho", "psi", "ploidy", "distance", "is.best") + rownames(purity_ploidy) = c("ASCAT", "FRAC_GENOME", "REF_SEG") + purity_ploidy["FRAC_GENOME", "rho"] = cellularity + purity_ploidy["FRAC_GENOME", "psi"] = ploidy + write.table(purity_ploidy, paste(outfile.prefix, "_rho_and_psi.txt", sep=""), quote=F, sep="\t") +} + +#' Transform ASCAT NGS output into Battenberg +#' +#' This function takes the ascat copynumber.caveman.csv and samplestatistics files to create +#' a minimum Battenberg subclones and rho_and_psi file for use in pre-processing. +#' These files only contain the essentials required by this package. +#' @param outfile.prefix String prefix for the output files. Internally _subclones.txt and _rho_and_psi.txt will be added. +#' @param copynumber.caveman.file String pointing to an ASCAT NGS copynumber.caveman.csv file +#' @param samplestatistics.file String point to an ASCAT NGS samplestatistics.csv file +#' @author sd11 +#' @export +ascatNgsToBattenberg = function(outfile.prefix, copynumber.caveman.file, samplestatistics.file) { + # Construct a minimum Battenberg file with the copy number segments + d = read.table(copynumber.caveman.file, sep=",", header=F, stringsAsFactors=F) + colnames(d) = c("count", "chr", "startpos", "endpos", "normal_total", "normal_minor", "tumour_total", "tumour_minor") + subclones = d[,2:4] + subclones$nMaj1_A = d$tumour_total-d$tumour_minor + subclones$nMin1_A = d$tumour_minor + subclones$frac1_A = 1 + subclones$nMaj2_A = NA + subclones$nMin2_A = NA + subclones$frac2_A = NA + write.table(subclones, paste(outfile.prefix, "_subclones.txt", sep=""), quote=F, sep="\t") + + # Now construct a minimum rho/psi file + samplestats = read.table(samplestatistics.file, header=F, stringsAsFactors=F) + + purity_ploidy = array(NA, c(3,5)) + colnames(purity_ploidy) = c("rho", "psi", "ploidy", "distance", "is.best") + rownames(purity_ploidy) = c("ASCAT", "FRAC_GENOME", "REF_SEG") + purity_ploidy["FRAC_GENOME", "rho"] = samplestats[samplestats$V1=="rho",2] + purity_ploidy["FRAC_GENOME", "psi"] = samplestats[samplestats$V1=="Ploidy",2] + write.table(purity_ploidy, paste(outfile.prefix, "_rho_and_psi.txt", sep=""), quote=F, sep="\t") +} + + +############################################ +# Combine all the steps into a DP input file +############################################ + +#' Check if the Y chromosome is present and the donor is male. If this statement is TRUE we need to +#' there is no estimate of the Y chromosome, which BB currently provides as two copies of X +#' @param subclone.data A read-in Battenberg subclones.txt file as a data.frame +#' @return The input data.frame with a Y chromosome entry added and possible X chromosome adjustment +#' @author sd11 +#' @noRd +addYchromToBattenberg = function(subclone.data) { + # Take the largest segment on X chromosome + xlargest = which.max(subclone.data[subclone.data$chr=="X", ]$endpos - subclone.data[subclone.data$chr=="X", ]$startpos) + xlargest = subclone.data[subclone.data$chr=="X", ][xlargest,] + + # Get the total availability of X + xnmaj = xlargest$nMaj1_A * xlargest$frac1_A + ifelse(is.na(xlargest$frac2_A), 0, xlargest$nMaj2_A * xlargest$frac2_A) + xnmin = xlargest$nMin1_A * xlargest$frac1_A + ifelse(is.na(xlargest$frac2_A), 0, xlargest$nMin2_A * xlargest$frac2_A) + + # If there are no two copies we assume that Y was lost and make no changes + if (xnmaj + xnmin >=2 & xnmin >= 1) { + # We have at least 1 copy of either allele. One must therefore be the Y chromosome + + # Remove a copy of the X chromosome + ynMaj_1 = subclone.data[subclone.data$chr=="X", c("nMin1_A")] + yfrac_1 = subclone.data[subclone.data$chr=="X", c("frac1_A")] + subclone.data[subclone.data$chr=="X", c("nMin1_A")] = 0 + + # if there was subclonal copy number we need to remove/adapt that too + if (!is.na(xlargest$frac2_A)) { + ynMaj_2 = subclone.data[subclone.data$chr=="X", c("nMin2_A")] + yfrac_2 = subclone.data[subclone.data$chr=="X", c("frac2_A")] + subclone.data[subclone.data$chr=="X", c("nMin2_A")] = 0 + } else { + ynMaj_2 = NA + yfrac_2 = NA + } + } else { + # No 2 copies, we assume that Y has been lost + ynMaj_1 = 0 + yfrac_1 = 1 + ynMaj_2 = NA + yfrac_2 = NA + } + + # Add Y + subclone.data = rbind(subclone.data, + data.frame(chr="Y", startpos=0, endpos=59373566, BAF=NA, pval=NA, LogR=NA, ntot=NA, + nMaj1_A=ynMaj_1, nMin1_A=0, frac1_A=yfrac_1, nMaj2_A=ynMaj_2, nMin2_A=NA, frac2_A=yfrac_2, SDfrac_A=NA, SDfrac_A_BS=NA, frac1_A_0.025=NA, frac1_A_0.975=NA, + nMaj1_B=NA, nMin1_B=NA, frac1_B=NA, nMaj2_B=NA, nMin2_B=NA, frac2_B=NA, SDfrac_B=NA, SDfrac_B_BS=NA, frac1_B_0.025=NA, frac1_B_0.975=NA, + nMaj1_C=NA, nMin1_C=NA, frac1_C=NA, nMaj2_C=NA, nMin2_C=NA, frac2_C=NA, SDfrac_C=NA, SDfrac_C_BS=NA, frac1_C_0.025=NA, frac1_C_0.975=NA, + nMaj1_D=NA, nMin1_D=NA, frac1_D=NA, nMaj2_D=NA, nMin2_D=NA, frac2_D=NA, SDfrac_D=NA, SDfrac_D_BS=NA, frac1_D_0.025=NA, frac1_D_0.975=NA, + nMaj1_E=NA, nMin1_E=NA, frac1_E=NA, nMaj2_E=NA, nMin2_E=NA, frac2_E=NA, SDfrac_E=NA, SDfrac_E_BS=NA, frac1_E_0.025=NA, frac1_E_0.975=NA, + nMaj1_F=NA, nMin1_F=NA, frac1_F=NA, nMaj2_F=NA, nMin2_F=NA, frac2_F=NA, SDfrac_F=NA, SDfrac_F_BS=NA, frac1_F_0.025=NA, frac1_F_0.975=NA) + ) + return(subclone.data) +} + +#' Main function that creates the DP input file. A higher level function should be called by users +#' @noRd +GetDirichletProcessInfo<-function(outputfile, cellularity, info, subclone.file, is.male = F, out.dir = NULL, SNP.phase.file = NULL, mut.phase.file = NULL, adjust_male_y_chrom=F){ + + subclone.data = read.table(subclone.file,sep="\t",header=T,stringsAsFactors=F) + # Add in the Y chrom if donor is male and Battenberg hasn't supplied it (BB returns X/Y ad multiple copies of X for men) + if (is.male & (! "Y" %in% subclone.data$chr) & adjust_male_y_chrom) { + subclone.data = addYchromToBattenberg(subclone.data) + } + subclone.data.gr = GenomicRanges::GRanges(subclone.data$chr, IRanges::IRanges(subclone.data$startpos, subclone.data$endpos), rep('*', nrow(subclone.data))) + elementMetadata(subclone.data.gr) = subclone.data[,3:ncol(subclone.data)] + subclone.data.gr = sortSeqlevels(subclone.data.gr) + + info_anno = as.data.frame(cbind(array(NA, c(length(info), 7)))) + colnames(info_anno) = c('subclonal.CN','nMaj1','nMin1','frac1','nMaj2','nMin2','frac2') + inds = findOverlaps(info, subclone.data.gr) + info_anno[queryHits(inds),2:7] = subclone.data[subjectHits(inds),][,c("nMaj1_A", "nMin1_A", "frac1_A", "nMaj2_A", "nMin2_A", "frac2_A")] + + CN1 = (info_anno[queryHits(inds),]$nMaj1 + info_anno[queryHits(inds),]$nMin1) * info_anno[queryHits(inds),]$frac1 + # If frac is not one for allele 1 (i.e. not only CN data for allele 1), add the CN contribution of allele 2 as well + CN2 = (info_anno[queryHits(inds),]$nMaj2 + info_anno[queryHits(inds),]$nMin2) * info_anno[queryHits(inds),]$frac2 * ifelse(info_anno[queryHits(inds),]$frac1 != 1, 1, 0) + CN2[is.na(CN2)] = 0 + info_anno[queryHits(inds),]$subclonal.CN = CN1 + CN2 + elementMetadata(info) = cbind(as.data.frame(elementMetadata(info)), info_anno) + + info$phase="unphased" + if (!is.null(SNP.phase.file) & SNP.phase.file!="NA") { + phasing = read.table(SNP.phase.file, header=T, stringsAsFactors=F) #header=T, skip=1, + phasing.gr = GenomicRanges::GRanges(phasing$Chr, IRanges::IRanges(phasing$Pos1, phasing$Pos1)) + phasing.gr$phasing = phasing$Parental + inds = findOverlaps(info, phasing.gr) + info$phase[queryHits(inds)] = phasing.gr$phasing[subjectHits(inds)] + + info$phase[is.na(info$phase)]="unphased" + } + + if(is.male & "chr" %in% names(info)){ + normal.CN = rep(2,nrow(info)) + normal.CN[info$chr=="X"| info$chr=="Y"] = 1 + info$mutation.copy.number = mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity, normal.CN) + }else{ + info$mutation.copy.number = mutationBurdenToMutationCopyNumber(info$mut.count/ (info$mut.count + info$WT.count) , info$subclonal.CN, cellularity) + } + + # convert MCN to subclonal fraction - tricky for amplified mutations + info$subclonal.fraction = info$mutation.copy.number + expected.burden.for.MCN = mutationCopyNumberToMutationBurden(rep(1,length(info)),info$subclonal.CN,cellularity) + non.zero.indices = which(info$mut.count>0 & !is.na(expected.burden.for.MCN)) + #test for mutations in more than 1 copy + p.vals = sapply(1:length(non.zero.indices),function(v,e,i){ + prop.test(v$mut.count[i],v$mut.count[i] + v$WT.count[i],e[i],alternative="greater")$p.value + },v=info[non.zero.indices,], e=expected.burden.for.MCN[non.zero.indices]) + amplified.muts = non.zero.indices[p.vals<=0.05] + + info$no.chrs.bearing.mut = 1 + + #copy numbers of subclones can only differ by 1 or 0 (as assumed when calling subclones) + if(length(amplified.muts)>0){ + for(a in 1:length(amplified.muts)){ + max.CN2=0 + #use phasing info - if on 'deleted' (lower CN chromosome), use the minor copy number + if(info$phase[amplified.muts[a]]=="MUT_ON_DELETED"){ + print("mut on minor chromosome") + max.CN1 = info$nMin1[amplified.muts[a]] + frac1 = info$frac1[amplified.muts[a]] + frac2=0 + if(!is.na(info$nMin2[amplified.muts[a]])){ + #swap subclones, so that the one with the higher CN is first + if(info$nMin2[amplified.muts[a]]>max.CN1){ + max.CN2 = max.CN1 + max.CN1 = info$nMin2[amplified.muts[a]] + frac2 = frac1 + frac1 = info$frac2[amplified.muts[a]] + }else{ + max.CN2 = info$nMin2[amplified.muts[a]] + frac2 = info$frac2[amplified.muts[a]] + } + } + }else{ + max.CN1 = info$nMaj1[amplified.muts[a]] + frac1 = info$frac1[amplified.muts[a]] + frac2=0 + if(!is.na(info$nMaj2[amplified.muts[a]])){ + #swap subclones, so that the one with the higher CN is first + if(info$nMaj2[amplified.muts[a]]>max.CN1){ + max.CN2 = max.CN1 + max.CN1 = info$nMaj2[amplified.muts[a]] + frac2 = frac1 + frac1 = info$frac2[amplified.muts[a]] + }else{ + max.CN2 = info$nMaj2[amplified.muts[a]] + frac2 = info$frac2[amplified.muts[a]] + } + } + } + best.err = info$mutation.copy.number[amplified.muts[a]] - 1 + best.CN=1 + for(j in 1:max.CN1){ + for(k in (j-1):min(j,max.CN2)){ + potential.CN = j * frac1 + k * frac2 + err = abs(info$mutation.copy.number[amplified.muts[a]]/potential.CN-1) + if(err0){ + for(a in 1:length(subclonal.muts)){ + #if there are no subclonal CNVs, don't adjust subclonal fraction + if(is.na(info$frac2[subclonal.muts[a]])){next} + #assume subclonal muts are on one chromosome copy, therefore mutation copy number must be subclonal fraction of the higher CN subclone (i.e. lost in lower CN subclone) or 1 (i.e. present in both subclones) + if(info$nMaj1[subclonal.muts[a]]+info$nMin1[subclonal.muts[a]] > info$nMaj2[subclonal.muts[a]]+info$nMin2[subclonal.muts[a]]){ + possible.subclonal.fractions = c(info$frac1[subclonal.muts[a]],1) + }else{ + possible.subclonal.fractions = c(info$frac2[subclonal.muts[a]],1) + } + best.CN = possible.subclonal.fractions[which.min(abs(info$mutation.copy.number[subclonal.muts[a]]/possible.subclonal.fractions - 1))] + #extra test 200313 - check whether subclonal CN results in clonal mutation, otherwise subclonal CN doesn't explain subclonal MCN + if(best.CN != 1 & prop.test(info$mut.count[subclonal.muts[a]],info$mut.count[subclonal.muts[a]]+info$WT.count[subclonal.muts[a]],expected.burden.for.MCN[subclonal.muts[a]] * best.CN)$p.value > 0.05){ + info$subclonal.fraction[subclonal.muts[a]] = info$mutation.copy.number[subclonal.muts[a]] / best.CN + info$no.chrs.bearing.mut[subclonal.muts[a]] = best.CN + } + } + } + + possible.zero.muts = intersect((1:length(info))[-non.zero.indices],which(!is.na(info$nMin1))) + possible.zero.muts = c(possible.zero.muts,non.zero.indices[p.vals2>0.05]) + if(length(possible.zero.muts)>0){ + del.indices = which(info$nMin1[possible.zero.muts]==0 & !info$phase[possible.zero.muts]=="MUT_ON_RETAINED") + info$subclonal.fraction[possible.zero.muts[del.indices]] = NA + info$no.chrs.bearing.mut[possible.zero.muts[del.indices]] = 0 + } + + # convert GenomicRanges object to df + df = data.frame(chr=as.data.frame(seqnames(info)), + start=start(info)-1, + end=end(info)) + df = cbind(df, as.data.frame(elementMetadata(info))) + colnames(df)[1] = "chr" + df = df[with(df, order(chr)),] + print(head(df)) + write.table(df, outputfile, sep="\t", row.names=F, quote=F) +} + +#' Convenience function to load the cellularity from a rho_and_psi file +#' @noRd +GetCellularity <- function(rho_and_psi_file) { + d = read.table(rho_and_psi_file, header=T, stringsAsFactors=F) + return(d['FRAC_GENOME','rho']) +} + +#' Convenience function to fetch WTCount and mutCount +#'@noRd +GetWTandMutCount <- function(loci_file, allele_frequencies_file) { + subs.data = read.table(loci_file, sep='\t', header=F, stringsAsFactors=F) + subs.data = subs.data[order(subs.data[,1], subs.data[,2]),] + + # Replace dinucleotides and longer with just the first base. Here we assume the depth of the second base is the same and the number of dinucleotides is so low that removing the second base is negligable + subs.data[,3] = apply(as.data.frame(subs.data[,3]), 1, function(x) { substring(x, 1,1) }) + subs.data[,4] = apply(as.data.frame(subs.data[,4]), 1, function(x) { substring(x, 1,1) }) + + subs.data.gr = GenomicRanges::GRanges(subs.data[,1], IRanges::IRanges(subs.data[,2], subs.data[,2]), rep('*', nrow(subs.data))) + elementMetadata(subs.data.gr) = subs.data[,c(3,4)] + + alleleFrequencies = read.delim(allele_frequencies_file, sep='\t', header=T, quote=NULL, stringsAsFactors=F) + alleleFrequencies = alleleFrequencies[order(alleleFrequencies[,1],alleleFrequencies[,2]),] + print(head(alleleFrequencies)) + alleleFrequencies.gr = GenomicRanges::GRanges(alleleFrequencies[,1], IRanges::IRanges(alleleFrequencies[,2], alleleFrequencies[,2]), rep('*', nrow(alleleFrequencies))) + elementMetadata(alleleFrequencies.gr) = alleleFrequencies[,3:7] + + # Subset the allele frequencies by the loci we would like to include + overlap = findOverlaps(subs.data.gr, alleleFrequencies.gr) + alleleFrequencies = alleleFrequencies[subjectHits(overlap),] + + nucleotides = c("A","C","G","T") + ref.indices = match(subs.data[,3],nucleotides) + alt.indices = match(subs.data[,4],nucleotides) + WT.count = as.numeric(unlist(sapply(1:nrow(alleleFrequencies),function(v,a,i){v[i,a[i]+2]},v=alleleFrequencies,a=ref.indices))) + mut.count = as.numeric(unlist(sapply(1:nrow(alleleFrequencies),function(v,a,i){v[i,a[i]+2]},v=alleleFrequencies,a=alt.indices))) + + combined = data.frame(chr=subs.data[,1],pos=subs.data[,2],WTCount=WT.count, mutCount=mut.count) + colnames(combined) = c("chr","pos","WT.count","mut.count") + + combined.gr = GenomicRanges::GRanges(seqnames(subs.data.gr), ranges(subs.data.gr), rep('*', nrow(subs.data))) + elementMetadata(combined.gr) = data.frame(WT.count=WT.count, mut.count=mut.count) + + combined.gr = sortSeqlevels(combined.gr) + return(combined.gr) +} + +############################################## +# GetDirichletProcessInfo +############################################## +#' Create the DPClust input file +#' +#' Function that takes allele counts and a copy number profile to estimate mutation copy number, +#' cancer cell fraction and multiplicity for each point mutation. +#' @param loci_file Simple four column file with chromosome, position, reference allele and alternative allele +#' @param allele_frequencies_file Output file from alleleCounter on the specified loci +#' @param cellularity_file Full path to a Battenberg rho_and_psi output file +#' @param subclone_file Full path to a Battenberg subclones.txt output file +#' @param gender Specify male or female +#' @param SNP.phase.file Output file from mut_mut_phasing, supply NA (as char) when not available +#' @param mut.phase.file Output file from mut_cn_phasing, supply NA (as char) when not available +#' @param output_file Name of the output file +#' @author sd11 +#' @export +runGetDirichletProcessInfo = function(loci_file, allele_frequencies_file, cellularity_file, subclone_file, gender, SNP.phase.file, mut.phase.file, output_file) { + if(gender == 'male' | gender == 'Male') { + isMale = T + } else if(gender == 'female' | gender == 'Female') { + isMale = F + } else { + stop("Unknown gender supplied, exit.") + } + info_counts = GetWTandMutCount(loci_file, allele_frequencies_file) + cellularity = GetCellularity(cellularity_file) + GetDirichletProcessInfo(output_file, cellularity, info_counts, subclone_file, is.male=isMale, SNP.phase.file=SNP.phase.file, mut.phase.file=mut.phase.file) +} + +############################################## +# dpIn to VCF +############################################## +#' DPClust input file to vcf +#' +#' Transform a dirichlet input file into a VCF with the same info. It filters out mutations in areas that are not contained in the supplied genome index (fai file) or are contained in the ignore file (ign file) +#' It takes the DP input file created by runGetDirichletProcessInfo and combines the columns with the vcf file supplied. Finally it gzips and indexes the file +#' @param vcf_infile Filename of the VCF file to use as a base +#' @param dpIn_file Filename of a DP input file +#' @param vcf_outfile Filename of the output file +#' @param fai_file Path to a reference genome index containing chromosome names +#' @param ign_file Path to a file containing contigs to ignore +#' @param genome Specify the reference genome for reading in the VCF +#' @author sd11 +#' @export +dpIn2vcf = function(vcf_infile, dpIn_file, vcf_outfile, fai_file, ign_file, genome="hg19") { + vcf = VariantAnnotation::readVcf(vcf_infile, genome=genome) + + # Remove muts on chroms not to look at + fai = parseFai(fai_file) + ign = parseIgnore(ign_file) + allowed_chroms = which(!(fai$chromosome %in% ign$chromosome)) + vcf = vcf[as.vector(seqnames(vcf)) %in% allowed_chroms,] + + # Read in the to be annotated data + dat = read.table(dpIn_file, header=T, stringsAsFactors=F) + + # Annotate the columns into the VCF object + vcf = addVcfInfoCol(vcf, dat$WT.count, 1, "Integer", "Number of reads carrying the wild type allele", "WC") + vcf = addVcfInfoCol(vcf, dat$mut.count, 1, "Integer", "Number of reads carrying the mutant allele", "MC") + vcf = addVcfInfoCol(vcf, dat$subclonal.CN, 1, "Float", "Total subclonal copynumber", "TSC") + vcf = addVcfInfoCol(vcf, dat$nMaj1, 1, "Float", "Copynumber of major allele 1", "NMA1") + vcf = addVcfInfoCol(vcf, dat$nMin1, 1, "Float", "Copynumber of minor allele 1", "NMI1") + vcf = addVcfInfoCol(vcf, dat$frac1, 1, "Float", "Fraction of tumour cells containing copy number state 1", "FR1") + vcf = addVcfInfoCol(vcf, dat$nMaj2, 1, "Float", "Copynumber of major allele 2", "NMA2") + vcf = addVcfInfoCol(vcf, dat$nMin2, 1, "Float", "Copynumber of minor allele 2", "NMI2") + vcf = addVcfInfoCol(vcf, dat$frac2, 1, "Float", "Fraction of tumour cells containing copy number state 2", "FR2") + vcf = addVcfInfoCol(vcf, dat$phase, 1, "String", "Phase relation mutation and copynumber", "PHS") + vcf = addVcfInfoCol(vcf, dat$mutation.copy.number, 1, "Float", "Mutation copy number", "MCN") + vcf = addVcfInfoCol(vcf, dat$subclonal.fraction, 1, "Float", "Fraction of tumour cells carying this mutation", "CCF") + vcf = addVcfInfoCol(vcf, dat$no.chrs.bearing.mut, 1, "Float", "Number of chromosomes bearing the mutation", "NCBM") + + # Write the output, gzip and index it + VariantAnnotation::writeVcf(vcf, file=vcf_outfile, index=T) +} + +#' Convenience function that annotates a column into the supplied VCF object +#' @noRd +addVcfInfoCol = function(vcf, data, number, type, description, abbreviation) { + i = header(vcf)@header$INFO + exptData(vcf)$header@header$INFO <- rbind(i, S4Vectors::DataFrame(Number=number, Type=type, Description=description, row.names=abbreviation)) + info(vcf)[,abbreviation] <- as(data, "CharacterList") + return(vcf) +} diff --git a/man/alleleCount.Rd b/man/alleleCount.Rd index bf4ab48..63fef04 100644 --- a/man/alleleCount.Rd +++ b/man/alleleCount.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{alleleCount} \alias{alleleCount} \title{Run alleleCount} diff --git a/man/calc_power.Rd b/man/calc_power.Rd new file mode 100644 index 0000000..3222895 --- /dev/null +++ b/man/calc_power.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/util.R +\name{calc_power} +\alias{calc_power} +\title{Calculate power to call subclones.} +\usage{ +calc_power(purity, ploidy, coverage) +} +\arguments{ +\item{purity}{The sample purity} + +\item{ploidy}{The tumour ploidy} + +\item{coverage}{The tumour sample coverage} +} +\value{ +The average reads per chromosome copy, a.k.a. power +} +\description{ +This function calculates the +average number of reads per clonal chromosome copy. +} +\author{ +sd11 +} + diff --git a/man/concat_files.Rd b/man/concat_files.Rd index 64dd53a..5f44875 100644 --- a/man/concat_files.Rd +++ b/man/concat_files.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/util.R \name{concat_files} \alias{concat_files} \title{Concatenate split files} diff --git a/man/createPng.Rd b/man/createPng.Rd new file mode 100644 index 0000000..6bbc65a --- /dev/null +++ b/man/createPng.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/qualitycontrol.R +\name{createPng} +\alias{createPng} +\title{Create a PNG file} +\usage{ +createPng(p, filename, width, height) +} +\arguments{ +\item{p}{The figure} + +\item{filename}{Where to save the image} + +\item{width}{Width} + +\item{height}{Height} +} +\description{ +Create a PNG file +} + diff --git a/man/dumpCounts.Broad.Rd b/man/dumpCounts.Broad.Rd index 357c806..c1b3b50 100644 --- a/man/dumpCounts.Broad.Rd +++ b/man/dumpCounts.Broad.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.Broad} \alias{dumpCounts.Broad} \title{Dump allele counts from vcf - Broad ICGC pancancer pipeline} diff --git a/man/dumpCounts.DKFZ.Rd b/man/dumpCounts.DKFZ.Rd index 6fd6e48..8781da0 100644 --- a/man/dumpCounts.DKFZ.Rd +++ b/man/dumpCounts.DKFZ.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.DKFZ} \alias{dumpCounts.DKFZ} \title{Dump allele counts from vcf - DKFZ ICGC pancancer pipeline} diff --git a/man/dumpCounts.ICGCconsensusIndel.Rd b/man/dumpCounts.ICGCconsensusIndel.Rd index fd9ea53..23090df 100644 --- a/man/dumpCounts.ICGCconsensusIndel.Rd +++ b/man/dumpCounts.ICGCconsensusIndel.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.ICGCconsensusIndel} \alias{dumpCounts.ICGCconsensusIndel} \title{Dump allele counts from vcf - ICGC pancancer consensus indel pipeline} diff --git a/man/dumpCounts.ICGCconsensusSNV.Rd b/man/dumpCounts.ICGCconsensusSNV.Rd index 68de88b..705727f 100644 --- a/man/dumpCounts.ICGCconsensusSNV.Rd +++ b/man/dumpCounts.ICGCconsensusSNV.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.ICGCconsensusSNV} \alias{dumpCounts.ICGCconsensusSNV} \title{Dump allele counts from vcf - ICGC pancancer consensus SNV pipeline} diff --git a/man/dumpCounts.Muse.Rd b/man/dumpCounts.Muse.Rd index ce7e529..999108b 100644 --- a/man/dumpCounts.Muse.Rd +++ b/man/dumpCounts.Muse.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.Muse} \alias{dumpCounts.Muse} \title{Dump allele counts from vcf - MuSE ICGC pancancer pipeline} diff --git a/man/dumpCounts.Sanger.Rd b/man/dumpCounts.Sanger.Rd index a726e30..37dea9d 100644 --- a/man/dumpCounts.Sanger.Rd +++ b/man/dumpCounts.Sanger.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.Sanger} \alias{dumpCounts.Sanger} \title{Dump allele counts from vcf - Sanger ICGC pancancer pipeline} diff --git a/man/dumpCounts.mutect.Rd b/man/dumpCounts.mutect.Rd index a3b78ed..29a1ec7 100644 --- a/man/dumpCounts.mutect.Rd +++ b/man/dumpCounts.mutect.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/allelecount.R \name{dumpCounts.mutect} \alias{dumpCounts.mutect} \title{Dump allele counts from vcf - Mutect} diff --git a/man/filterForDeaminase.Rd b/man/filterForDeaminase.Rd index 29f64e3..0876ec7 100644 --- a/man/filterForDeaminase.Rd +++ b/man/filterForDeaminase.Rd @@ -9,6 +9,8 @@ filterForDeaminase(loci_file, outfile, ref_genome, trinucleotide_column = 5, alt_allele_column = 4) } \arguments{ +\item{loci_file}{Filepath to a with tri-nucleotide context annotated loci} + \item{outfile}{Filepath to where to store the output.} \item{ref_genome}{Full path to an indexed reference genome fasta file} @@ -16,8 +18,6 @@ filterForDeaminase(loci_file, outfile, ref_genome, trinucleotide_column = 5, \item{trinucleotide_column}{Integer representing the column within the input file that contains the context} \item{alt_allele_column}{The column in the annotated loci file that contains the reference base.} - -\item{signature_anno_loci_file}{Filepath to a with tri-nucleotide context annotated loci} } \description{ Convenience function that filters a with tri-nucleotide context annotated list of loci for diff --git a/man/plot_ccf_hist.Rd b/man/plot_ccf_hist.Rd new file mode 100644 index 0000000..cb1f771 --- /dev/null +++ b/man/plot_ccf_hist.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/qualitycontrol.R +\name{plot_ccf_hist} +\alias{plot_ccf_hist} +\title{Plot cancer cell fraction histogram} +\usage{ +plot_ccf_hist(data, samplename, outdir) +} +\arguments{ +\item{data}{A DPClust input table} + +\item{samplename}{Name of the sample} + +\item{outdir}{Directory where the figure is to be stored} +} +\description{ +Plot cancer cell fraction histogram +} +\author{ +sd11 +} + diff --git a/man/plot_mcn_hist.Rd b/man/plot_mcn_hist.Rd new file mode 100644 index 0000000..8d34e82 --- /dev/null +++ b/man/plot_mcn_hist.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/qualitycontrol.R +\name{plot_mcn_hist} +\alias{plot_mcn_hist} +\title{Plot mutation copy number histogram} +\usage{ +plot_mcn_hist(data, samplename, outdir) +} +\arguments{ +\item{data}{A DPClust input table} + +\item{samplename}{Name of the sample} + +\item{outdir}{Directory where the figure is to be stored} +} +\description{ +Plot mutation copy number histogram +} +\author{ +sd11 +} + diff --git a/man/plot_vaf_hist.Rd b/man/plot_vaf_hist.Rd new file mode 100644 index 0000000..7645373 --- /dev/null +++ b/man/plot_vaf_hist.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/qualitycontrol.R +\name{plot_vaf_hist} +\alias{plot_vaf_hist} +\title{Plot allele frequency histogram} +\usage{ +plot_vaf_hist(data, samplename, outdir) +} +\arguments{ +\item{data}{A DPClust input table} + +\item{samplename}{Name of the sample} + +\item{outdir}{Directory where the figure is to be stored} +} +\description{ +Plot allele frequency histogram +} +\author{ +sd11 +} + diff --git a/man/runQc.Rd b/man/runQc.Rd deleted file mode 100644 index b1127ae..0000000 --- a/man/runQc.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/qualitycontrol.R -\name{runQc} -\alias{runQc} -\title{Run the QC on specified data} -\usage{ -runQc(infile, datpath, outpath) -} -\description{ -Run the QC on specified data -} - diff --git a/man/split_by_chrom.Rd b/man/split_by_chrom.Rd index 788521f..ba5add5 100644 --- a/man/split_by_chrom.Rd +++ b/man/split_by_chrom.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2 (4.1.1): do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/util.R \name{split_by_chrom} \alias{split_by_chrom} \title{Split a file per chromosome}