jts · rdeborja · Oct 19, 2021 · Oct 19, 2021 · Nov 11, 2021
diff --git a/workflow/rules/analysis.smk b/workflow/rules/analysis.smk
@@ -68,6 +68,18 @@ rule make_alleles:
     shell:
         "python {params.alleles_script} --reference-name MN908947.3 {input} > {output}"
 
+# parse the data directory and create the alleles file from sample VCFs
+rule make_alleles_vcf:
+    input:
+        get_data_directory
+    output:
+        "qc_analysis/{prefix}_alleles_vcf.tsv"
+    params:
+        script=srcdir("../scripts/vcf2alleles.py"),
+        pattern=get_variant_format_pattern
+    shell:
+        "python {params.script} --path {input} --pattern {params.pattern} > {output}"
+
 # assign lineages to the consensus genomes using pangolin
 rule make_lineage_assignments:
     input:

diff --git a/workflow/rules/defaults.smk b/workflow/rules/defaults.smk
@@ -2,6 +2,8 @@
 # Wrapper for accessing the config, with sensible defaults
 #
 
+import re
+
 #
 def get_bam_pattern():
     return config.get("bam_pattern", "{data_root}/{sample}.sorted.bam")
@@ -64,3 +66,14 @@ def get_watch_mutation_set(wildcards):
 #
 def get_voc_pango_lineages(wildcards):
     return config.get("voc_pango_lineages", "B.1.1.7,B.1.351,P.1,B.1.617.2")
+
+#
+def get_data_directory(wildcards=None):
+    return config["data_root"]
+
+#
+def get_variant_format_pattern(wildcards=None):
+    variant_format_pattern = config['variants_pattern']
+    _pattern = re.sub('{data_root}/{sample}', '', variant_format_pattern)
+    return _pattern
+
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
@@ -42,13 +42,14 @@ rule make_negative_control_report:
 rule make_mixture_report:
     input:
         fpileups="qc_sequencing/{prefix}_fpileups.fofn",
+        vcf_alleles="qc_analysis/{prefix}_alleles_vcf.tsv",
         alleles="qc_analysis/{prefix}_alleles.tsv"
     output:
         "qc_reports/{prefix}_mixture_report.tsv"
     params:
         script=srcdir("../scripts/mixture_check.py")
     shell:
-        "python {params.script} --fpileup {input.fpileups} --alleles {input.alleles} > {output}"
+        "python {params.script} --fpileup {input.fpileups} --alleles {input.vcf_alleles} > {output}"
 
 # make a simple report containing positions that frequently contain IUPAC ambiguity codes
 rule make_ambiguous_position_report:

diff --git a/workflow/scripts/plot/plot_recurrent_variant_heatmap_snpeff.R b/workflow/scripts/plot/plot_recurrent_variant_heatmap_snpeff.R
@@ -3,8 +3,6 @@ library(ggplot2)
 library(RColorBrewer)
 library(argparse)
 
-path = '/Users/rdeborja/Desktop/vars'
-
 get_func_consequence_levels <- function() {
   return(c(
     'chromosome_number_variation',

diff --git a/workflow/scripts/vcf2alleles.py b/workflow/scripts/vcf2alleles.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+
+
+from ncov.parser.Vcf import *
+import argparse
+import os
+import sys
+import re
+import csv
+from copy import deepcopy
+
+
+def init_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--path', required=True,
+                        help='path containing VCF files to process')
+    parser.add_argument('-t', '--pattern', default='.variants.norm.vcf',
+                        help='the VCF file pattern (default: .variants.norm.vcf)')
+    return parser.parse_args()
+
+def create_default_sample(name, pos, ref, alt):
+    return {'samples' : [{'name' : name, 'pos': pos, 'ref': ref, 'alt': alt}]}
+
+
+def get_sample_name(file, pattern):
+    return re.sub(pattern, '', file)
+
+
+def write_alleles_line(row):
+    """
+    Write out sample mutation as follows:
+    * name
+    * pos
+    * ref_allele
+    * alt_allele
+    * samples_with_allele
+    """
+    samples_with_alleles = len(row['samples'])
+    for sample in row['samples']:
+        if samples_with_alleles < 1:
+            continue
+        # ignore indels
+        elif len(str(sample['ref'])) > 1 or len(str(sample['alt'])) > 1:
+            continue
+        else:
+            print('\t'.join([str(sample['name']),
+                             str(sample['pos']),
+                             str(sample['ref']),
+                             str(sample['alt']),
+                             str(samples_with_alleles)]))
+
+
+def is_ambiguous(vaf, upper_limit=0.75, lower_limit=0.25):
+    """
+    Determine whether the ALT allele should be an IUPAC code
+    based on the VAF.
+    """
+    assert upper_limit > lower_limit, "VAF upper_limit must be greater than lower_limit"
+    if vaf >= upper_limit:
+        return False
+    elif vaf < upper_limit and vaf >= lower_limit:
+        return True
+    else:
+        sys.exit('Invalid VAF or VAF less than 0.25')
+
+
+def get_iupac(ref, alt):
+    """
+    From the concatenated reference and alternate allele,
+    return the corresponding IUPAC code.
+    """
+    ref_alt = ''.join([ref, alt])
+    iupac_map = { "AC" : "M",
+                  "CA" : "M",
+                  "AG" : "R",
+                  "GA" : "R",
+                  "AT" : "W",
+                  "TA" : "W",
+                  "CG" : "S",
+                  "GC" : "S",
+                  "CT" : "Y",
+                  "TC" : "Y",
+                  "TG" : "K",
+                  "GT" : "K" }
+    return iupac_map[ref_alt]
+
+
+def separate_mutations(var):
+    """
+    Several mutations were identified as having multiple
+    nucleotide polymorphisms, however the mixture_check.py
+    script requires each mutation to be a separate line
+    in the alleles.tsv file.  This function returns
+    a list of variants.
+    """
+    var_orig = var
+    vars = list()
+    tmp_ref = str(var.REF)
+    tmp_alt = str(var.ALT[0])
+    tmp_pos = int(var.POS)
+    for idx, (ref, alt) in enumerate(zip(tmp_ref, tmp_alt)):
+        tmp_var = deepcopy(var)
+        tmp_var.REF = ref
+        tmp_var.ALT[0] = alt
+        if ref == alt:
+            continue
+        else:
+            tmp_var.POS = tmp_pos + int(idx)
+            vars.append(tmp_var)
+    return vars
+
+
+def is_mnp(var):
+    """
+    Check if the mutation is a multiple nucleotide polymorphism.
+    """
+    if len(str(var.ALT[0])) == len(str(var.REF)) and len(str(var.ALT[0])) > 1:
+        return True
+    elif len(str(var.ALT[0])) != len(str(var.REF)):
+        return False
+
+
+def get_variants_from_vcf(file):
+    """
+    Returns a list of single nucleotide variants from a VCF file.
+    """
+    _var_list = list()
+    _vcf_reader = vcf.Reader(filename=file)
+    for _var in _vcf_reader:
+        if is_mnp(var=_var):
+            _var_list.extend(separate_mutations(var=_var))
+        elif len(str(_var.REF)) == 1 and len(str(_var.ALT[0])) == 1:
+            _var_list.append(_var)
+        else:
+            continue
+    return _var_list
+
+
+
+def main():
+    """
+    Main program
+    """
+    args = init_args()
+    var_dict = dict()
+    # loop through all VCF files in the path provided
+    for file in os.listdir(args.path):
+        vcf_list = list()
+        if file.endswith(args.pattern):
+            vcf_reader = vcf.Reader(filename='/'.join([args.path, file]))
+            # import the variants in the VCF file while converting MNPs to
+            # SNPs
+            for var in vcf_reader:
+                if len(str(var.REF)) == 1 and len(str(var.ALT[0])) == 1:
+                    vcf_list.append(var)
+                # skip indels but process multiple nucleotide polymorphisms
+                elif is_mnp(var=var):
+                    vcf_list.extend(separate_mutations(var=var))
+                else:
+                    continue
+            # create the alleles dictionary
+            for _var in vcf_list: 
+                vaf = _var.INFO['VAF'][0]
+                if is_ambiguous(vaf):
+                    _var.ALT[0] = get_iupac(str(_var.REF), str(_var.ALT[0]))
+                var_id = create_vcf_variant_id(var=_var)
+                if var_id not in var_dict:
+                    var_dict[var_id] = create_default_sample(
+                            name=get_sample_name(file=file, pattern=args.pattern),
+                            pos=_var.POS,
+                            ref=_var.REF,
+                            alt=_var.ALT[0])
+                elif var_id in var_dict:
+                    var_dict[var_id]['samples'].append(
+                            {'name': get_sample_name(file=file, pattern=args.pattern),
+                             'pos': _var.POS, 'ref': _var.REF, 'alt': _var.ALT[0]})
+    print('\t'.join(['name', 'pos', 'ref_allele', 'alt_allele', 'samples_with_allele']))
+    for item in var_dict:
+        write_alleles_line(var_dict[item])
+
+
+if __name__ == '__main__':
+    main()
+
+
+#__END__
+