diff --git a/bcbio/variation/octopus.py b/bcbio/variation/octopus.py index 76dd1b331..710ca47c4 100644 --- a/bcbio/variation/octopus.py +++ b/bcbio/variation/octopus.py @@ -3,6 +3,9 @@ https://github.com/luntergroup/octopus """ import os +import subprocess + +import pysam from bcbio import utils from bcbio.distributed.transaction import file_transaction @@ -25,7 +28,7 @@ def run(align_bams, items, ref_file, assoc_files, region, out_file): return _run_germline(align_bams, items, ref_file, target, out_file) return out_file -def _produce_compatible_vcf(out_file, data): +def _produce_compatible_vcf(out_file, data, is_somatic): """Create a compatible VCF that downstream tools can deal with. - htsjdk and thus GATK and Picard do not support VCF4.3: @@ -35,17 +38,48 @@ def _produce_compatible_vcf(out_file, data): - Fixes `##contig` lines since octopus only writes contigs used in the BED file region, causing incompatibilies with GatherVcfs when merging + - Fixes alleles prefixed with '*' like 'C,*T' which cause + downstream failures when reading with GATK. """ base, ext = utils.splitext_plus(out_file) legacy_file = "%s.legacy%s" % (base, ext) + if is_somatic: + legacy_file = _covert_to_diploid(legacy_file, data) final_file = "%s.vcf.gz" % base cat_cmd = "zcat" if legacy_file.endswith(".gz") else "cat" contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data), out_file) + remove_problem_alleles = r"sed 's/,\*\([A-Z]\)/,\1/'" cmd = ("{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | " - "{contig_cl} | bgzip -c > {final_file}") + "{remove_problem_alleles} | {contig_cl} | bgzip -c > {final_file}") do.run(cmd.format(**locals()), "Produce compatible VCF output file from octopus") return vcfutils.bgzip_and_index(out_file, data["config"]) +def _covert_to_diploid(in_file, data): + """Converts non-diploid somatic outputs into diploid. + + https://github.com/luntergroup/octopus/wiki/Case-study:-Tumour-only-UMI#evaluate-variant-calls + """ + sample = dd.get_sample_name(data) + out_file = "%s-diploid.vcf" % utils.splitext_plus(in_file)[0] + in_vcf = pysam.VariantFile(in_file) + out_vcf = pysam.VariantFile(out_file, 'w', header=in_vcf.header) + for record in in_vcf: + gt = list(record.samples[sample]['GT']) + if 'SOMATIC' in record.info: + for allele in set(gt): + if allele != gt[0]: + record.samples[sample]['GT'] = gt[0], allele + out_vcf.write(record) + else: + if len(gt) == 1: + record.samples[sample]['GT'] = gt + else: + record.samples[sample]['GT'] = gt[0], gt[1] + out_vcf.write(record) + in_vcf.close() + out_vcf.close() + return vcfutils.bgzip_and_index(out_file, data["config"]) + def _run_germline(align_bams, items, ref_file, target, out_file): """Run germline calling, handling populations. @@ -68,23 +102,44 @@ def _run_germline(align_bams, items, ref_file, target, out_file): def _run_somatic(paired, ref_file, target, out_file): """Run somatic calling with octopus, handling both paired and tumor-only cases. - TODO: - - Should we set downsample-above and downsample-target which default to 1000 - and 500? How will these effect high depth panels and runtimes? + Tweaks for low frequency, tumor only and UMI calling documented in: + https://github.com/luntergroup/octopus/blob/develop/configs/UMI.config """ align_bams = paired.tumor_bam if paired.normal_bam: align_bams += " %s --normal-sample %s" % (paired.normal_bam, paired.normal_name) cores = dd.get_num_cores(paired.tumor_data) - min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0 + # Do not try to search below 0.4% currently as leads to long runtimes + # https://github.com/luntergroup/octopus/issues/29#issuecomment-428167979 + min_af = max([float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0, 0.004]) + min_af_floor = min_af / 4.0 cmd = ("octopus --threads {cores} --reference {ref_file} --reads {align_bams} " "--regions-file {target} " - "--min-credible-somatic-frequency {min_af} " - "-C cancer " + "--min-credible-somatic-frequency {min_af_floor} --min-expected-somatic-frequency {min_af} " + "--downsample-above 4000 --downsample-target 4000 --min-kmer-prune 5 --min-bubble-score 20 " + "--max-haplotypes 200 --somatic-snv-mutation-rate '5e-4' --somatic-indel-mutation-rate '1e-05' " + "--target-working-memory 5G --target-read-buffer-footprint 5G --max-somatic-haplotypes 3 " + "--caller cancer " "--working-directory {tmp_dir} " "-o {tx_out_file} --legacy") + if not paired.normal_bam: + cmd += (" --tumour-germline-concentration 5") + if dd.get_umi_type(paired.tumor_data) or _is_umi_consensus_bam(paired.tumor_bam): + cmd += (" --allow-octopus-duplicates --overlap-masking 0 " + "--somatic-filter-expression 'GQ < 200 | MQ < 30 | SB > 0.2 | SD[.25] > 0.1 " + "| BQ < 40 | DP < 100 | MF > 0.1 | AD < 5 | CC > 1.1 | GQD > 2'") with file_transaction(paired.tumor_data, out_file) as tx_out_file: tmp_dir = os.path.dirname(tx_out_file) do.run(cmd.format(**locals()), "Octopus somatic calling") - _produce_compatible_vcf(tx_out_file, paired.tumor_data) + _produce_compatible_vcf(tx_out_file, paired.tumor_data, is_somatic=True) return out_file + +def _is_umi_consensus_bam(in_file): + """Check if input BAM file generated by fgbio consensus calls on UMIs. + + Identify these by lack of duplicated reads. + This is useful for pre-aligned consensus BAMs feeding into octopus. + """ + cmd = "samtools view -h %s | head -500000 | samtools view -c -f 1024" + count = subprocess.check_output(cmd % in_file, shell=True) + return int(count) == 0