From b2e5d1f8e7a5dffa3eb5a7c3dc29a289ba0f8560 Mon Sep 17 00:00:00 2001
From: chapmanb <chapmanb@50mail.com>
Date: Mon, 15 Oct 2018 10:38:03 -0400
Subject: [PATCH] octopus: support for UMI and low frequency calling

Provides parameter adjustments based on discussions and validation work in
luntergroup/octopus#29

https://github.com/bcbio/bcbio_validations/tree/master/somatic-lowfreq#vardict-156-octopus-051b

- Parameter adjustments for low frequency and UMI samples based
  on Octopus configuration
- Correctly detected UMI bams lacking duplicates even on pre-aligned input BAMs.
- Fix issues with problem '*' alleles in outputs.
- Convert output file GTs into standard diploid calls.
---
 bcbio/variation/octopus.py | 73 +++++++++++++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 9 deletions(-)

diff --git a/bcbio/variation/octopus.py b/bcbio/variation/octopus.py
index 76dd1b331..710ca47c4 100644
--- a/bcbio/variation/octopus.py
+++ b/bcbio/variation/octopus.py
@@ -3,6 +3,9 @@
 https://github.com/luntergroup/octopus
 """
 import os
+import subprocess
+
+import pysam
 
 from bcbio import utils
 from bcbio.distributed.transaction import file_transaction
@@ -25,7 +28,7 @@ def run(align_bams, items, ref_file, assoc_files, region, out_file):
             return _run_germline(align_bams, items, ref_file, target, out_file)
     return out_file
 
-def _produce_compatible_vcf(out_file, data):
+def _produce_compatible_vcf(out_file, data, is_somatic):
     """Create a compatible VCF that downstream tools can deal with.
 
     - htsjdk and thus GATK and Picard do not support VCF4.3:
@@ -35,17 +38,48 @@ def _produce_compatible_vcf(out_file, data):
     - Fixes `##contig` lines since octopus only writes contigs
       used in the BED file region, causing incompatibilies with
       GatherVcfs when merging
+    - Fixes alleles prefixed with '*' like 'C,*T' which cause
+      downstream failures when reading with GATK.
     """
     base, ext = utils.splitext_plus(out_file)
     legacy_file = "%s.legacy%s" % (base, ext)
+    if is_somatic:
+        legacy_file = _covert_to_diploid(legacy_file, data)
     final_file = "%s.vcf.gz" % base
     cat_cmd = "zcat" if legacy_file.endswith(".gz") else "cat"
     contig_cl = vcfutils.add_contig_to_header_cl(dd.get_ref_file(data), out_file)
+    remove_problem_alleles = r"sed 's/,\*\([A-Z]\)/,\1/'"
     cmd = ("{cat_cmd} {legacy_file} | sed 's/fileformat=VCFv4.3/fileformat=VCFv4.2/' | "
-           "{contig_cl} | bgzip -c > {final_file}")
+           "{remove_problem_alleles} | {contig_cl} | bgzip -c > {final_file}")
     do.run(cmd.format(**locals()), "Produce compatible VCF output file from octopus")
     return vcfutils.bgzip_and_index(out_file, data["config"])
 
+def _covert_to_diploid(in_file, data):
+    """Converts non-diploid somatic outputs into diploid.
+
+    https://github.com/luntergroup/octopus/wiki/Case-study:-Tumour-only-UMI#evaluate-variant-calls
+    """
+    sample = dd.get_sample_name(data)
+    out_file = "%s-diploid.vcf" % utils.splitext_plus(in_file)[0]
+    in_vcf = pysam.VariantFile(in_file)
+    out_vcf = pysam.VariantFile(out_file, 'w', header=in_vcf.header)
+    for record in in_vcf:
+        gt = list(record.samples[sample]['GT'])
+        if 'SOMATIC' in record.info:
+            for allele in set(gt):
+                if allele != gt[0]:
+                    record.samples[sample]['GT'] = gt[0], allele
+                    out_vcf.write(record)
+        else:
+            if len(gt) == 1:
+                record.samples[sample]['GT'] = gt
+            else:
+                record.samples[sample]['GT'] = gt[0], gt[1]
+            out_vcf.write(record)
+    in_vcf.close()
+    out_vcf.close()
+    return vcfutils.bgzip_and_index(out_file, data["config"])
+
 def _run_germline(align_bams, items, ref_file, target, out_file):
     """Run germline calling, handling populations.
 
@@ -68,23 +102,44 @@ def _run_germline(align_bams, items, ref_file, target, out_file):
 def _run_somatic(paired, ref_file, target, out_file):
     """Run somatic calling with octopus, handling both paired and tumor-only cases.
 
-    TODO:
-      - Should we set downsample-above and downsample-target which default to 1000
-        and 500? How will these effect high depth panels and runtimes?
+    Tweaks for low frequency, tumor only and UMI calling documented in:
+    https://github.com/luntergroup/octopus/blob/develop/configs/UMI.config
     """
     align_bams = paired.tumor_bam
     if paired.normal_bam:
         align_bams += " %s --normal-sample %s" % (paired.normal_bam, paired.normal_name)
     cores = dd.get_num_cores(paired.tumor_data)
-    min_af = float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0
+    # Do not try to search below 0.4% currently as leads to long runtimes
+    # https://github.com/luntergroup/octopus/issues/29#issuecomment-428167979
+    min_af = max([float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0, 0.004])
+    min_af_floor = min_af / 4.0
     cmd = ("octopus --threads {cores} --reference {ref_file} --reads {align_bams} "
            "--regions-file {target} "
-           "--min-credible-somatic-frequency {min_af} "
-           "-C cancer "
+           "--min-credible-somatic-frequency {min_af_floor} --min-expected-somatic-frequency {min_af} "
+           "--downsample-above 4000 --downsample-target 4000 --min-kmer-prune 5 --min-bubble-score 20 "
+           "--max-haplotypes 200 --somatic-snv-mutation-rate '5e-4' --somatic-indel-mutation-rate '1e-05' "
+           "--target-working-memory 5G --target-read-buffer-footprint 5G --max-somatic-haplotypes 3 "
+           "--caller cancer "
            "--working-directory {tmp_dir} "
            "-o {tx_out_file} --legacy")
+    if not paired.normal_bam:
+        cmd += (" --tumour-germline-concentration 5")
+    if dd.get_umi_type(paired.tumor_data) or _is_umi_consensus_bam(paired.tumor_bam):
+        cmd += (" --allow-octopus-duplicates --overlap-masking 0 "
+                "--somatic-filter-expression 'GQ < 200 | MQ < 30 | SB > 0.2 | SD[.25] > 0.1 "
+                "| BQ < 40 | DP < 100 | MF > 0.1 | AD < 5 | CC > 1.1 | GQD > 2'")
     with file_transaction(paired.tumor_data, out_file) as tx_out_file:
         tmp_dir = os.path.dirname(tx_out_file)
         do.run(cmd.format(**locals()), "Octopus somatic calling")
-        _produce_compatible_vcf(tx_out_file, paired.tumor_data)
+        _produce_compatible_vcf(tx_out_file, paired.tumor_data, is_somatic=True)
     return out_file
+
+def _is_umi_consensus_bam(in_file):
+    """Check if input BAM file generated by fgbio consensus calls on UMIs.
+
+    Identify these by lack of duplicated reads.
+    This is useful for pre-aligned consensus BAMs feeding into octopus.
+    """
+    cmd = "samtools view -h %s | head -500000 | samtools view -c -f 1024"
+    count = subprocess.check_output(cmd % in_file, shell=True)
+    return int(count) == 0