nextstrain · rneher · Aug 27, 2018 · Aug 16, 2018 · Aug 22, 2018 · Aug 22, 2018
diff --git a/augur/clades.py b/augur/clades.py
@@ -0,0 +1,95 @@
+import os, sys
+import numpy as np
+from Bio import SeqIO, SeqFeature, Seq, SeqRecord, Phylo
+from .utils import read_node_data, write_json
+from collections import defaultdict
+
+def read_in_clade_definitions(clade_file):
+    '''
+    Reads in tab-seperated file that defines clades by amino-acid or nucleotide
+
+    Format:
+    clade	gene	site	alt
+    Clade_2	embC	940	S
+    Clade_2	Rv3463	192	K
+    Clade_3	Rv2209	432	I
+    Clade_4	nuc	27979	G
+    '''
+    import pandas as pd
+
+    clades = defaultdict(lambda:defaultdict(list))
+
+    df = pd.read_csv(clade_file, sep='\t' if clade_file.endswith('.tsv') else ',')
+    for mi, m in df.iterrows():
+        clades[m.clade][m.gene].append((m.site,m.alt))
+
+    return clades
+
+
+def is_node_in_clade(clade_mutations, node_muts):
+    '''
+    Determines whether a node contains all mutations that define a clade
+    '''
+    is_clade = False
+    # cycle through each gene in the clade definition and see if this node has the specified mutations
+    for gene, defining_mutations in clade_mutations.items():
+        #check the node has the gene in question and that it contains any mutations
+        if (gene in node_muts and node_muts[gene]):
+            # mutations are stored on nodes in format 'R927H', but in clade defining_mutations as (927, 'H')
+            # So convert node mutations ('mutation') to format (927, 'H') here, for easy comparison
+            formatted_mutations = [(int(mutation[1:-1]), mutation[-1]) for mutation in node_muts[gene]] 
+            # If all of the clade-defining mutations are in the node mutations, it's part of this clade.
+            if all([mut in formatted_mutations for mut in defining_mutations]):
+                is_clade = True
+            else:
+                return False
+        else:
+            return False
+
+    return is_clade
+
+
+def assign_clades(clade_designations, muts, tree):
+    '''
+    Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node
+    to see if it's the first member of a clade (assigns 'clade_annotation'), and sets
+    all nodes's clade_membership to the value of their parent. This will change if later found to be
+    the first member of a clade.
+    '''
+    clades = {}
+    for n in tree.get_nonterminals(order = 'preorder'):
+        n_muts = {}
+        if 'aa_muts' in muts[n.name]:
+            n_muts = muts[n.name]['aa_muts']
+        if 'muts' in muts[n.name]:
+            n_muts['nuc'] = muts[n.name]['muts'] # Put nuc mutations in with 'nuc' as the 'gene' so all can be searched together
+
+        if n.name not in clades: # This ensures every node gets an entry - otherwise auspice doesn't display nicely
+            clades[n.name]={"clade_membership": "Unassigned"}
+        for clade, definition in clade_designations.items():
+            if is_node_in_clade(definition, n_muts):
+                clades[n.name] = {"clade_annotation":clade, "clade_membership": clade}
+
+        # Ensures each node is set to membership of their parent initially (unless changed later in tree traversal)
+        for c in n:
+            clades[c.name]={"clade_membership": clades[n.name]["clade_membership"] }
+
+    return clades
+
+
+def run(args):
+    ## read tree and data, if reading data fails, return with error code
+    tree = Phylo.read(args.tree, 'newick')
+    node_data = read_node_data(args.mutations, args.tree)
+    if node_data is None:
+        print("ERROR: could not read node data (incl sequences)")
+        return 1
+
+    clade_designations = read_in_clade_definitions(args.clades)
+
+    muts = node_data['nodes']
+
+    clades = assign_clades(clade_designations, muts, tree)
+
+    write_json({'nodes':clades}, args.output)
+    print("clades written to", args.output, file=sys.stdout)
diff --git a/augur/translate.py b/augur/translate.py
@@ -170,6 +170,17 @@ def construct_mut(start, pos, end):
 def assign_aa_vcf(tree, translations):
     aa_muts = {}
 
+    #get mutations on the root
+    root = tree.root
+    aa_muts[root.name]={"aa_muts":{}}
+    for fname, prot in translations.items():
+        root_muts = prot['sequences'][root.name]
+        tmp = []
+        for pos in prot['positions']:
+            if pos in root_muts:
+                tmp.append(construct_mut(prot['reference'][pos], int(pos+1), root_muts[pos]))
+        aa_muts[root.name]["aa_muts"][fname] = tmp
+
     for n in tree.get_nonterminals():
         for c in n:
             aa_muts[c.name]={"aa_muts":{}}
@@ -225,7 +236,7 @@ def run(args):
     if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
         if not args.vcf_reference:
             print("ERROR: a reference Fasta is required with VCF-format input")
-            return -1
+            return 1
         compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
         sequences = compress_seq['sequences']
         ref = compress_seq['reference']
@@ -234,7 +245,7 @@ def run(args):
         node_data = read_node_data(args.ancestral_sequences, args.tree)
         if node_data is None:
             print("ERROR: could not read node data (incl sequences)")
-            return -1
+            return 1
         # extract sequences from node meta data
         sequences = {}
         for k,v in node_data['nodes'].items():
@@ -246,7 +257,7 @@ def run(args):
     print("Read in {} features from reference sequence file".format(len(features)))
     if features is None:
         print("ERROR: could not read features of reference sequence file")
-        return -1
+        return 1
 
     ### translate every feature - but not 'nuc'!
     translations = {}
@@ -282,6 +293,11 @@ def run(args):
         aa_muts = assign_aa_vcf(tree, translations)
     else:
         aa_muts = {}
+
+        #fasta input shouldn't have mutations on root, so give empty entry
+        root = tree.get_nonterminals()[0]
+        aa_muts[root.name]={"aa_muts":{}}
+
         for n in tree.get_nonterminals():
             for c in n:
                 aa_muts[c.name]={"aa_muts":{}}

diff --git a/bin/augur b/bin/augur
@@ -2,7 +2,7 @@
 import sys
 from augur import parse, filter, align, tree, refine, ancestral
 from augur import traits, translate, mask, titers, export
-from augur import validate, sequence_traits
+from augur import validate, sequence_traits, clades
 
 if __name__=="__main__":
     import argparse
@@ -127,6 +127,14 @@ if __name__=="__main__":
     translate_parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
     translate_parser.set_defaults(func=translate.run)
 
+    ## CLADES.PY
+    clades_parser = subparsers.add_parser('clades', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    clades_parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
+    clades_parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
+    clades_parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
+    clades_parser.add_argument('--output', type=str, help="name of JSON files for clades")
+    clades_parser.set_defaults(func=clades.run)
+
     ## TRAITS.PY
     traits_parser = subparsers.add_parser('traits', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     traits_parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on")

diff --git a/builds/tb/Snakefile b/builds/tb/Snakefile
@@ -19,7 +19,8 @@ rule config:
         genes = "data/genes.txt",
         colors = "data/color.tsv",
         config = "data/config.json",
-        geo_info = "data/lat_longs.tsv"
+        geo_info = "data/lat_longs.tsv",
+        clades = "data/clades.tsv"
 
 config = rules.config.params #so we can use config.x rather than rules.config.params.x
 #end of config definition
@@ -117,14 +118,29 @@ rule translate:
             --alignment-output {output.vcf_out}
         """
 
+rule clades:
+    input:
+        tree = rules.refine.output.tree,
+        aa_muts = rules.translate.output.aa_data,
+        nuc_muts = rules.ancestral.output.nt_data,
+        clades = config.clades
+    output:
+        clade_data = "results/clades.json"
+    shell:
+        """
+        augur clades --tree {input.tree} \
+            --mutations {input.nuc_muts} {input.aa_muts} \
+            --output {output.clade_data} --clades {input.clades}
+        """
+
 rule traits:
     input:
         tree = rules.refine.output.tree,
         meta = config.meta
     output:
         "results/traits.json"
     params:
-        traits = 'location cluster'
+        traits = 'location'
     shell:
         'augur traits --tree {input.tree} --metadata {input.meta}'
         ' --columns {params.traits} --output {output}'
@@ -159,7 +175,8 @@ rule export:
         drms = rules.seqtraits.output.drm_data,
         color_defs = config.colors,
         config = config.config,
-        geo_info = config.geo_info
+        geo_info = config.geo_info,
+        clades = rules.clades.output.clade_data
     output:
         tree = rules.all.input.auspice_tree,
         meta = rules.all.input.auspice_meta
@@ -168,7 +185,7 @@ rule export:
         augur export \
             --tree {input.tree} \
             --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.traits} {input.drms} {input.aa_muts} {input.nt_muts} \
+            --node-data {input.branch_lengths} {input.traits} {input.drms} {input.aa_muts} {input.nt_muts} {input.clades} \
             --auspice-config {input.config} \
             --colors {input.color_defs} \
             --output-tree {output.tree} \

diff --git a/builds/tb/data/clades.tsv b/builds/tb/data/clades.tsv
@@ -0,0 +1,14 @@
+clade	gene	site	alt
+Mj-I	ctpE	81	D
+Mj-II	nuc	30642	T
+Mj-III.a	nuc	444296	A
+Mj-III.b	nuc	1558108	T
+Mj-III.c	nuc	921390	C
+Mj-IV.a	pks8	634	T
+Mj-IV.b	Rv2752c	533	M
+Mj-IV.c	nuc	208073	T
+Mj-V.a	nuc	166518	A
+Mj-V.b	nuc	1497601	C
+Mj-V.c	nuc	1503915	T
+Mj-V.d	nuc	3825469	C
+Mj-VI	nuc	79748	G
diff --git a/builds/tb/data/color.tsv b/builds/tb/data/color.tsv
@@ -25,5 +25,18 @@ cluster	Mj-V.c	#6a3d9a
 cluster	Mj-VI	#fdbf6f
 cluster	Mn	#ff7f00
 
-
+clade_membership	Mj-I	#a6cee3
+clade_membership	Mj-II	#fb9a99
+clade_membership	Mj-III.a	#b15928
+clade_membership	Mj-III.b	#ffff99
+clade_membership	Mj-III.c	#00441b
+clade_membership	Mj-IV.a	#e31a1c
+clade_membership	Mj-IV.b	#33a02c
+clade_membership	Mj-IV.c	#cab2d6
+clade_membership	Mj-V.a	#1f78b4
+clade_membership	Mj-V.b	#67001f
+clade_membership	Mj-V.d	#b2df8a
+clade_membership	Mj-V.c	#6a3d9a
+clade_membership	Mj-VI	#fdbf6f
+clade_membership	Unassigned	#AAAAAA
 
diff --git a/builds/tb/data/config.json b/builds/tb/data/config.json
@@ -13,17 +13,21 @@
    "type": "continuous", 
    "key": "num_date"
   },
-  "location": {
+  "clade_membership": {
+   "menuItem": "Clade",
+   "legendTitle": "Clade",
+   "type": "discrete",
+   "key": "clade_membership"
   },
-  "cluster": {
+  "location": {
   }
  },
  "geo": [
    "location"
  ],
  "filters": [
   "location", 
-  "cluster", 
+  "clade_membership",
   "authors"
  ],
   "maintainer": [

diff --git a/tests/builds/tb/Snakefile b/tests/builds/tb/Snakefile
@@ -20,7 +20,8 @@ rule config:
         genes = "data/genes.txt",
         colors = "data/color.tsv",
         config = "data/config.json",
-        geo_info = "data/lat_longs.tsv"
+        geo_info = "data/lat_longs.tsv",
+        clades = "data/clades.tsv"
 
 config = rules.config.params #so we can use config.x rather than rules.config.params.x
 #end of config definition
@@ -118,6 +119,21 @@ rule translate:
             --alignment-output {output.vcf_out}
         """
 
+rule clades:
+    input:
+        tree = rules.refine.output.tree,
+        aa_muts = rules.translate.output.aa_data,
+        nuc_muts = rules.ancestral.output.nt_data,
+        clades = config.clades
+    output:
+        clade_data = "results/clades.json"
+    shell:
+        """
+        augur clades --tree {input.tree} \
+            --mutations {input.nuc_muts} {input.aa_muts} \
+            --output {output.clade_data} --clades {input.clades}
+        """
+
 rule traits:
     input:
         tree = rules.refine.output.tree,
@@ -141,7 +157,8 @@ rule export:
         aa_muts = rules.translate.output.aa_data,
         color_defs = config.colors,
         config = config.config,
-        geo_info = config.geo_info
+        geo_info = config.geo_info,
+        clades = rules.clades.output.clade_data
     output:
         tree = rules.all.input.auspice_tree,
         meta = rules.all.input.auspice_meta
@@ -150,7 +167,7 @@ rule export:
         augur export \
             --tree {input.tree} \
             --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} \
+            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
             --auspice-config {input.config} \
             --colors {input.color_defs} \
             --lat-longs {input.geo_info} \
@@ -170,7 +187,8 @@ rule exportv2:
         nt_muts = rules.ancestral.output.nt_data,
         aa_muts = rules.translate.output.aa_data,
         color_defs = config.colors,
-        geo_info = config.geo_info
+        geo_info = config.geo_info,
+        clades = rules.clades.output.clade_data
     output:
         main = rules.all.input.auspice_main
     params:
@@ -186,7 +204,7 @@ rule exportv2:
             --new-schema \
             --tree {input.tree} \
             --metadata {input.metadata} \
-            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} \
+            --node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
             --colors {input.color_defs} \
             --lat-longs {input.geo_info} \
             --output-main {output.main} \

diff --git a/tests/builds/tb/data/clades.tsv b/tests/builds/tb/data/clades.tsv
@@ -0,0 +1,14 @@
+clade	gene	site	alt
+Mj-I	ctpE	81	D
+Mj-II	nuc	30642	T
+Mj-III.a	nuc	444296	A
+Mj-III.b	nuc	1558108	T
+Mj-III.c	nuc	921390	C
+Mj-IV.a	pks8	634	T
+Mj-IV.b	Rv2752c	533	M
+Mj-IV.c	nuc	208073	T
+Mj-V.a	nuc	166518	A
+Mj-V.b	nuc	1497601	C
+Mj-V.c	nuc	1503915	T
+Mj-V.d	nuc	3825469	C
+Mj-VI	nuc	79748	G