diff --git a/workflow/rules/analysis.smk b/workflow/rules/analysis.smk index 851e9e8..bd17c5d 100644 --- a/workflow/rules/analysis.smk +++ b/workflow/rules/analysis.smk @@ -68,6 +68,18 @@ rule make_alleles: shell: "python {params.alleles_script} --reference-name MN908947.3 {input} > {output}" +# parse the data directory and create the alleles file from sample VCFs +rule make_alleles_vcf: + input: + get_data_directory + output: + "qc_analysis/{prefix}_alleles_vcf.tsv" + params: + script=srcdir("../scripts/vcf2alleles.py"), + pattern=get_variant_format_pattern + shell: + "python {params.script} --path {input} --pattern {params.pattern} > {output}" + # assign lineages to the consensus genomes using pangolin rule make_lineage_assignments: input: diff --git a/workflow/rules/defaults.smk b/workflow/rules/defaults.smk index 2a3b7de..4077994 100644 --- a/workflow/rules/defaults.smk +++ b/workflow/rules/defaults.smk @@ -2,6 +2,8 @@ # Wrapper for accessing the config, with sensible defaults # +import re + # def get_bam_pattern(): return config.get("bam_pattern", "{data_root}/{sample}.sorted.bam") @@ -64,3 +66,14 @@ def get_watch_mutation_set(wildcards): # def get_voc_pango_lineages(wildcards): return config.get("voc_pango_lineages", "B.1.1.7,B.1.351,P.1,B.1.617.2") + +# +def get_data_directory(wildcards=None): + return config["data_root"] + +# +def get_variant_format_pattern(wildcards=None): + variant_format_pattern = config['variants_pattern'] + _pattern = re.sub('{data_root}/{sample}', '', variant_format_pattern) + return _pattern + diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 3330282..19a29d9 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -42,13 +42,14 @@ rule make_negative_control_report: rule make_mixture_report: input: fpileups="qc_sequencing/{prefix}_fpileups.fofn", + vcf_alleles="qc_analysis/{prefix}_alleles_vcf.tsv", alleles="qc_analysis/{prefix}_alleles.tsv" output: "qc_reports/{prefix}_mixture_report.tsv" params: script=srcdir("../scripts/mixture_check.py") shell: - "python {params.script} --fpileup {input.fpileups} --alleles {input.alleles} > {output}" + "python {params.script} --fpileup {input.fpileups} --alleles {input.vcf_alleles} > {output}" # make a simple report containing positions that frequently contain IUPAC ambiguity codes rule make_ambiguous_position_report: diff --git a/workflow/scripts/plot/plot_recurrent_variant_heatmap_snpeff.R b/workflow/scripts/plot/plot_recurrent_variant_heatmap_snpeff.R index a2c61d2..753d9a3 100644 --- a/workflow/scripts/plot/plot_recurrent_variant_heatmap_snpeff.R +++ b/workflow/scripts/plot/plot_recurrent_variant_heatmap_snpeff.R @@ -3,8 +3,6 @@ library(ggplot2) library(RColorBrewer) library(argparse) -path = '/Users/rdeborja/Desktop/vars' - get_func_consequence_levels <- function() { return(c( 'chromosome_number_variation', diff --git a/workflow/scripts/vcf2alleles.py b/workflow/scripts/vcf2alleles.py new file mode 100644 index 0000000..18f3f96 --- /dev/null +++ b/workflow/scripts/vcf2alleles.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python + + +from ncov.parser.Vcf import * +import argparse +import os +import sys +import re +import csv +from copy import deepcopy + + +def init_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--path', required=True, + help='path containing VCF files to process') + parser.add_argument('-t', '--pattern', default='.variants.norm.vcf', + help='the VCF file pattern (default: .variants.norm.vcf)') + return parser.parse_args() + +def create_default_sample(name, pos, ref, alt): + return {'samples' : [{'name' : name, 'pos': pos, 'ref': ref, 'alt': alt}]} + + +def get_sample_name(file, pattern): + return re.sub(pattern, '', file) + + +def write_alleles_line(row): + """ + Write out sample mutation as follows: + * name + * pos + * ref_allele + * alt_allele + * samples_with_allele + """ + samples_with_alleles = len(row['samples']) + for sample in row['samples']: + if samples_with_alleles < 1: + continue + # ignore indels + elif len(str(sample['ref'])) > 1 or len(str(sample['alt'])) > 1: + continue + else: + print('\t'.join([str(sample['name']), + str(sample['pos']), + str(sample['ref']), + str(sample['alt']), + str(samples_with_alleles)])) + + +def is_ambiguous(vaf, upper_limit=0.75, lower_limit=0.25): + """ + Determine whether the ALT allele should be an IUPAC code + based on the VAF. + """ + assert upper_limit > lower_limit, "VAF upper_limit must be greater than lower_limit" + if vaf >= upper_limit: + return False + elif vaf < upper_limit and vaf >= lower_limit: + return True + else: + sys.exit('Invalid VAF or VAF less than 0.25') + + +def get_iupac(ref, alt): + """ + From the concatenated reference and alternate allele, + return the corresponding IUPAC code. + """ + ref_alt = ''.join([ref, alt]) + iupac_map = { "AC" : "M", + "CA" : "M", + "AG" : "R", + "GA" : "R", + "AT" : "W", + "TA" : "W", + "CG" : "S", + "GC" : "S", + "CT" : "Y", + "TC" : "Y", + "TG" : "K", + "GT" : "K" } + return iupac_map[ref_alt] + + +def separate_mutations(var): + """ + Several mutations were identified as having multiple + nucleotide polymorphisms, however the mixture_check.py + script requires each mutation to be a separate line + in the alleles.tsv file. This function returns + a list of variants. + """ + var_orig = var + vars = list() + tmp_ref = str(var.REF) + tmp_alt = str(var.ALT[0]) + tmp_pos = int(var.POS) + for idx, (ref, alt) in enumerate(zip(tmp_ref, tmp_alt)): + tmp_var = deepcopy(var) + tmp_var.REF = ref + tmp_var.ALT[0] = alt + if ref == alt: + continue + else: + tmp_var.POS = tmp_pos + int(idx) + vars.append(tmp_var) + return vars + + +def is_mnp(var): + """ + Check if the mutation is a multiple nucleotide polymorphism. + """ + if len(str(var.ALT[0])) == len(str(var.REF)) and len(str(var.ALT[0])) > 1: + return True + elif len(str(var.ALT[0])) != len(str(var.REF)): + return False + + +def get_variants_from_vcf(file): + """ + Returns a list of single nucleotide variants from a VCF file. + """ + _var_list = list() + _vcf_reader = vcf.Reader(filename=file) + for _var in _vcf_reader: + if is_mnp(var=_var): + _var_list.extend(separate_mutations(var=_var)) + elif len(str(_var.REF)) == 1 and len(str(_var.ALT[0])) == 1: + _var_list.append(_var) + else: + continue + return _var_list + + + +def main(): + """ + Main program + """ + args = init_args() + var_dict = dict() + # loop through all VCF files in the path provided + for file in os.listdir(args.path): + vcf_list = list() + if file.endswith(args.pattern): + vcf_reader = vcf.Reader(filename='/'.join([args.path, file])) + # import the variants in the VCF file while converting MNPs to + # SNPs + for var in vcf_reader: + if len(str(var.REF)) == 1 and len(str(var.ALT[0])) == 1: + vcf_list.append(var) + # skip indels but process multiple nucleotide polymorphisms + elif is_mnp(var=var): + vcf_list.extend(separate_mutations(var=var)) + else: + continue + # create the alleles dictionary + for _var in vcf_list: + vaf = _var.INFO['VAF'][0] + if is_ambiguous(vaf): + _var.ALT[0] = get_iupac(str(_var.REF), str(_var.ALT[0])) + var_id = create_vcf_variant_id(var=_var) + if var_id not in var_dict: + var_dict[var_id] = create_default_sample( + name=get_sample_name(file=file, pattern=args.pattern), + pos=_var.POS, + ref=_var.REF, + alt=_var.ALT[0]) + elif var_id in var_dict: + var_dict[var_id]['samples'].append( + {'name': get_sample_name(file=file, pattern=args.pattern), + 'pos': _var.POS, 'ref': _var.REF, 'alt': _var.ALT[0]}) + print('\t'.join(['name', 'pos', 'ref_allele', 'alt_allele', 'samples_with_allele'])) + for item in var_dict: + write_alleles_line(var_dict[item]) + + +if __name__ == '__main__': + main() + + +#__END__ +