Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clade Specification by AA and Nuc #206

Merged
merged 6 commits into from
Aug 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions augur/clades.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os, sys
import numpy as np
from Bio import SeqIO, SeqFeature, Seq, SeqRecord, Phylo
from .utils import read_node_data, write_json
from collections import defaultdict

def read_in_clade_definitions(clade_file):
'''
Reads in tab-seperated file that defines clades by amino-acid or nucleotide

Format:
clade gene site alt
Clade_2 embC 940 S
Clade_2 Rv3463 192 K
Clade_3 Rv2209 432 I
Clade_4 nuc 27979 G
'''
import pandas as pd

clades = defaultdict(lambda:defaultdict(list))

df = pd.read_csv(clade_file, sep='\t' if clade_file.endswith('.tsv') else ',')
for mi, m in df.iterrows():
clades[m.clade][m.gene].append((m.site,m.alt))

return clades


def is_node_in_clade(clade_mutations, node_muts):
'''
Determines whether a node contains all mutations that define a clade
'''
is_clade = False
# cycle through each gene in the clade definition and see if this node has the specified mutations
for gene, defining_mutations in clade_mutations.items():
#check the node has the gene in question and that it contains any mutations
if (gene in node_muts and node_muts[gene]):
# mutations are stored on nodes in format 'R927H', but in clade defining_mutations as (927, 'H')
# So convert node mutations ('mutation') to format (927, 'H') here, for easy comparison
formatted_mutations = [(int(mutation[1:-1]), mutation[-1]) for mutation in node_muts[gene]]
# If all of the clade-defining mutations are in the node mutations, it's part of this clade.
if all([mut in formatted_mutations for mut in defining_mutations]):
is_clade = True
else:
return False
else:
return False

return is_clade


def assign_clades(clade_designations, muts, tree):
'''
Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node
to see if it's the first member of a clade (assigns 'clade_annotation'), and sets
all nodes's clade_membership to the value of their parent. This will change if later found to be
the first member of a clade.
'''
clades = {}
for n in tree.get_nonterminals(order = 'preorder'):
n_muts = {}
if 'aa_muts' in muts[n.name]:
n_muts = muts[n.name]['aa_muts']
if 'muts' in muts[n.name]:
n_muts['nuc'] = muts[n.name]['muts'] # Put nuc mutations in with 'nuc' as the 'gene' so all can be searched together

if n.name not in clades: # This ensures every node gets an entry - otherwise auspice doesn't display nicely
clades[n.name]={"clade_membership": "Unassigned"}
for clade, definition in clade_designations.items():
if is_node_in_clade(definition, n_muts):
clades[n.name] = {"clade_annotation":clade, "clade_membership": clade}

# Ensures each node is set to membership of their parent initially (unless changed later in tree traversal)
for c in n:
clades[c.name]={"clade_membership": clades[n.name]["clade_membership"] }

return clades


def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
node_data = read_node_data(args.mutations, args.tree)
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return 1

clade_designations = read_in_clade_definitions(args.clades)

muts = node_data['nodes']

clades = assign_clades(clade_designations, muts, tree)

write_json({'nodes':clades}, args.output)
print("clades written to", args.output, file=sys.stdout)
22 changes: 19 additions & 3 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,17 @@ def construct_mut(start, pos, end):
def assign_aa_vcf(tree, translations):
aa_muts = {}

#get mutations on the root
root = tree.root
aa_muts[root.name]={"aa_muts":{}}
for fname, prot in translations.items():
root_muts = prot['sequences'][root.name]
tmp = []
for pos in prot['positions']:
if pos in root_muts:
tmp.append(construct_mut(prot['reference'][pos], int(pos+1), root_muts[pos]))
aa_muts[root.name]["aa_muts"][fname] = tmp

for n in tree.get_nonterminals():
for c in n:
aa_muts[c.name]={"aa_muts":{}}
Expand Down Expand Up @@ -225,7 +236,7 @@ def run(args):
if any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
if not args.vcf_reference:
print("ERROR: a reference Fasta is required with VCF-format input")
return -1
return 1
compress_seq = read_vcf(args.ancestral_sequences, args.vcf_reference)
sequences = compress_seq['sequences']
ref = compress_seq['reference']
Expand All @@ -234,7 +245,7 @@ def run(args):
node_data = read_node_data(args.ancestral_sequences, args.tree)
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return -1
return 1
# extract sequences from node meta data
sequences = {}
for k,v in node_data['nodes'].items():
Expand All @@ -246,7 +257,7 @@ def run(args):
print("Read in {} features from reference sequence file".format(len(features)))
if features is None:
print("ERROR: could not read features of reference sequence file")
return -1
return 1

### translate every feature - but not 'nuc'!
translations = {}
Expand Down Expand Up @@ -282,6 +293,11 @@ def run(args):
aa_muts = assign_aa_vcf(tree, translations)
else:
aa_muts = {}

#fasta input shouldn't have mutations on root, so give empty entry
root = tree.get_nonterminals()[0]
aa_muts[root.name]={"aa_muts":{}}

for n in tree.get_nonterminals():
for c in n:
aa_muts[c.name]={"aa_muts":{}}
Expand Down
10 changes: 9 additions & 1 deletion bin/augur
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
from augur import parse, filter, align, tree, refine, ancestral
from augur import traits, translate, mask, titers, export
from augur import validate, sequence_traits
from augur import validate, sequence_traits, clades

if __name__=="__main__":
import argparse
Expand Down Expand Up @@ -127,6 +127,14 @@ if __name__=="__main__":
translate_parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
translate_parser.set_defaults(func=translate.run)

## CLADES.PY
clades_parser = subparsers.add_parser('clades', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
clades_parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
clades_parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
clades_parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
clades_parser.add_argument('--output', type=str, help="name of JSON files for clades")
clades_parser.set_defaults(func=clades.run)

## TRAITS.PY
traits_parser = subparsers.add_parser('traits', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
traits_parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on")
Expand Down
25 changes: 21 additions & 4 deletions builds/tb/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ rule config:
genes = "data/genes.txt",
colors = "data/color.tsv",
config = "data/config.json",
geo_info = "data/lat_longs.tsv"
geo_info = "data/lat_longs.tsv",
clades = "data/clades.tsv"

config = rules.config.params #so we can use config.x rather than rules.config.params.x
#end of config definition
Expand Down Expand Up @@ -117,14 +118,29 @@ rule translate:
--alignment-output {output.vcf_out}
"""

rule clades:
input:
tree = rules.refine.output.tree,
aa_muts = rules.translate.output.aa_data,
nuc_muts = rules.ancestral.output.nt_data,
clades = config.clades
output:
clade_data = "results/clades.json"
shell:
"""
augur clades --tree {input.tree} \
--mutations {input.nuc_muts} {input.aa_muts} \
--output {output.clade_data} --clades {input.clades}
"""

rule traits:
input:
tree = rules.refine.output.tree,
meta = config.meta
output:
"results/traits.json"
params:
traits = 'location cluster'
traits = 'location'
shell:
'augur traits --tree {input.tree} --metadata {input.meta}'
' --columns {params.traits} --output {output}'
Expand Down Expand Up @@ -159,7 +175,8 @@ rule export:
drms = rules.seqtraits.output.drm_data,
color_defs = config.colors,
config = config.config,
geo_info = config.geo_info
geo_info = config.geo_info,
clades = rules.clades.output.clade_data
output:
tree = rules.all.input.auspice_tree,
meta = rules.all.input.auspice_meta
Expand All @@ -168,7 +185,7 @@ rule export:
augur export \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.traits} {input.drms} {input.aa_muts} {input.nt_muts} \
--node-data {input.branch_lengths} {input.traits} {input.drms} {input.aa_muts} {input.nt_muts} {input.clades} \
--auspice-config {input.config} \
--colors {input.color_defs} \
--output-tree {output.tree} \
Expand Down
14 changes: 14 additions & 0 deletions builds/tb/data/clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
clade gene site alt
Mj-I ctpE 81 D
Mj-II nuc 30642 T
Mj-III.a nuc 444296 A
Mj-III.b nuc 1558108 T
Mj-III.c nuc 921390 C
Mj-IV.a pks8 634 T
Mj-IV.b Rv2752c 533 M
Mj-IV.c nuc 208073 T
Mj-V.a nuc 166518 A
Mj-V.b nuc 1497601 C
Mj-V.c nuc 1503915 T
Mj-V.d nuc 3825469 C
Mj-VI nuc 79748 G
15 changes: 14 additions & 1 deletion builds/tb/data/color.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,18 @@ cluster Mj-V.c #6a3d9a
cluster Mj-VI #fdbf6f
cluster Mn #ff7f00


clade_membership Mj-I #a6cee3
clade_membership Mj-II #fb9a99
clade_membership Mj-III.a #b15928
clade_membership Mj-III.b #ffff99
clade_membership Mj-III.c #00441b
clade_membership Mj-IV.a #e31a1c
clade_membership Mj-IV.b #33a02c
clade_membership Mj-IV.c #cab2d6
clade_membership Mj-V.a #1f78b4
clade_membership Mj-V.b #67001f
clade_membership Mj-V.d #b2df8a
clade_membership Mj-V.c #6a3d9a
clade_membership Mj-VI #fdbf6f
clade_membership Unassigned #AAAAAA

10 changes: 7 additions & 3 deletions builds/tb/data/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,21 @@
"type": "continuous",
"key": "num_date"
},
"location": {
"clade_membership": {
"menuItem": "Clade",
"legendTitle": "Clade",
"type": "discrete",
"key": "clade_membership"
},
"cluster": {
"location": {
}
},
"geo": [
"location"
],
"filters": [
"location",
"cluster",
"clade_membership",
"authors"
],
"maintainer": [
Expand Down
28 changes: 23 additions & 5 deletions tests/builds/tb/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ rule config:
genes = "data/genes.txt",
colors = "data/color.tsv",
config = "data/config.json",
geo_info = "data/lat_longs.tsv"
geo_info = "data/lat_longs.tsv",
clades = "data/clades.tsv"

config = rules.config.params #so we can use config.x rather than rules.config.params.x
#end of config definition
Expand Down Expand Up @@ -118,6 +119,21 @@ rule translate:
--alignment-output {output.vcf_out}
"""

rule clades:
input:
tree = rules.refine.output.tree,
aa_muts = rules.translate.output.aa_data,
nuc_muts = rules.ancestral.output.nt_data,
clades = config.clades
output:
clade_data = "results/clades.json"
shell:
"""
augur clades --tree {input.tree} \
--mutations {input.nuc_muts} {input.aa_muts} \
--output {output.clade_data} --clades {input.clades}
"""

rule traits:
input:
tree = rules.refine.output.tree,
Expand All @@ -141,7 +157,8 @@ rule export:
aa_muts = rules.translate.output.aa_data,
color_defs = config.colors,
config = config.config,
geo_info = config.geo_info
geo_info = config.geo_info,
clades = rules.clades.output.clade_data
output:
tree = rules.all.input.auspice_tree,
meta = rules.all.input.auspice_meta
Expand All @@ -150,7 +167,7 @@ rule export:
augur export \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} \
--node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
--auspice-config {input.config} \
--colors {input.color_defs} \
--lat-longs {input.geo_info} \
Expand All @@ -170,7 +187,8 @@ rule exportv2:
nt_muts = rules.ancestral.output.nt_data,
aa_muts = rules.translate.output.aa_data,
color_defs = config.colors,
geo_info = config.geo_info
geo_info = config.geo_info,
clades = rules.clades.output.clade_data
output:
main = rules.all.input.auspice_main
params:
Expand All @@ -186,7 +204,7 @@ rule exportv2:
--new-schema \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} \
--node-data {input.branch_lengths} {input.traits} {input.aa_muts} {input.nt_muts} {input.clades} \
--colors {input.color_defs} \
--lat-longs {input.geo_info} \
--output-main {output.main} \
Expand Down
14 changes: 14 additions & 0 deletions tests/builds/tb/data/clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
clade gene site alt
Mj-I ctpE 81 D
Mj-II nuc 30642 T
Mj-III.a nuc 444296 A
Mj-III.b nuc 1558108 T
Mj-III.c nuc 921390 C
Mj-IV.a pks8 634 T
Mj-IV.b Rv2752c 533 M
Mj-IV.c nuc 208073 T
Mj-V.a nuc 166518 A
Mj-V.b nuc 1497601 C
Mj-V.c nuc 1503915 T
Mj-V.d nuc 3825469 C
Mj-VI nuc 79748 G
Loading