Skip to content

Commit

Permalink
Initial commit including snakemake and config
Browse files Browse the repository at this point in the history
  • Loading branch information
trvrb committed Dec 31, 2018
0 parents commit 3e0f9fe
Show file tree
Hide file tree
Showing 14 changed files with 2,022 additions and 0 deletions.
41 changes: 41 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Files created by the pipeline, which we want to keep out of git
# (or at least out of _this_ git repo).
data/
results/
auspice/
build/

# Sensitive environment variables
environment*

# Snakemake state dir
/.snakemake

# Local config overrides
/config_local.yaml

# For Python #
##############
*.pyc
.tox/
.cache/

# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so

# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
Icon?
ehthumbs.db
Thumbs.db
283 changes: 283 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']

rule all:
input:
auspice_tree = expand("auspice/dengue_{serotype}_tree.json", serotype=serotypes),
auspice_meta = expand("auspice/dengue_{serotype}_meta.json", serotype=serotypes),

rule files:
params:
dropped_strains = "config/dropped_strains.txt",
reference = "config/reference_dengue_{serotype}.gb",
clades = "config/clades.tsv",
auspice_config = "config/auspice_config_{serotype}.json"

files = rules.files.params

def serotype_integer(w):
serotypes = {'denv1': '1', 'denv2': '2', 'denv3': '3', 'denv4': '4'}
return serotypes[w.serotype]

rule download:
message: "Downloading sequences from fauna"
output:
sequences = "data/dengue_{serotype}.fasta"
params:
fasta_fields = "strain virus accession collection_date region country division location source locus authors url title journal puburl",
serotype_integer = serotype_integer
shell:
"""
python3 ../fauna/vdb/download.py \
--database vdb \
--virus dengue \
--fasta_fields {params.fasta_fields} \
--select serotype:{params.serotype_integer} \
--path $(dirname {output.sequences}) \
--fstem $(basename {output.sequences} .fasta)
"""

rule concat:
message: "Concatenating serotype sequences"
input:
denv1_sequences = "data/dengue_denv1.fasta",
denv2_sequences = "data/dengue_denv2.fasta",
denv3_sequences = "data/dengue_denv3.fasta",
denv4_sequences = "data/dengue_denv4.fasta"
output:
sequences = "data/dengue_all.fasta"
shell:
"""
cat {input.denv1_sequences} \
{input.denv2_sequences} \
{input.denv3_sequences} \
{input.denv4_sequences} \
> {output.sequences}
"""

rule parse:
message: "Parsing fasta into sequences and metadata"
input:
sequences = rules.download.output.sequences
output:
sequences = "results/sequences_{serotype}.fasta",
metadata = "results/metadata_{serotype}.tsv"
params:
fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url"
shell:
"""
augur parse \
--sequences {input.sequences} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--fields {params.fasta_fields}
"""

rule filter:
message:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- excluding strains in {input.exclude}
- minimum genome length of {params.min_length}
"""
input:
sequences = rules.parse.output.sequences,
metadata = rules.parse.output.metadata,
exclude = files.dropped_strains
output:
sequences = "results/filtered_{serotype}.fasta"
params:
group_by = "year",
sequences_per_group = 30,
min_length = 5000
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--min-length {params.min_length}
"""

rule align:
message:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences = rules.filter.output.sequences,
reference = files.reference
output:
alignment = "results/aligned_{serotype}.fasta"
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--fill-gaps \
--remove-reference
"""

rule tree:
message: "Building tree"
input:
alignment = rules.align.output.alignment
output:
tree = "results/tree-raw_{serotype}.nwk"
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree}
"""

rule refine:
message:
"""
Refining tree
- estimate timetree
- use {params.coalescent} coalescent timescale
- estimate {params.date_inference} node dates
- filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
"""
input:
tree = rules.tree.output.tree,
alignment = rules.align.output,
metadata = rules.parse.output.metadata
output:
tree = "results/tree_{serotype}.nwk",
node_data = "results/branch-lengths_{serotype}.json"
params:
coalescent = "const",
date_inference = "marginal",
clock_filter_iqd = 4
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
--coalescent {params.coalescent} \
--date-confidence \
--date-inference {params.date_inference} \
--clock-filter-iqd {params.clock_filter_iqd}
"""

rule ancestral:
message: "Reconstructing ancestral sequences and mutations"
input:
tree = rules.refine.output.tree,
alignment = rules.align.output
output:
node_data = "results/nt-muts_{serotype}.json"
params:
inference = "joint"
shell:
"""
augur ancestral \
--tree {input.tree} \
--alignment {input.alignment} \
--output {output.node_data} \
--inference {params.inference}
"""

rule translate:
message: "Translating amino acid sequences"
input:
tree = rules.refine.output.tree,
node_data = rules.ancestral.output.node_data,
reference = files.reference
output:
node_data = "results/aa-muts_{serotype}.json"
shell:
"""
augur translate \
--tree {input.tree} \
--ancestral-sequences {input.node_data} \
--reference-sequence {input.reference} \
--output {output.node_data} \
"""

rule traits:
message:
"""
Inferring ancestral traits for {params.columns!s}
- increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias
"""
input:
tree = rules.refine.output.tree,
metadata = rules.parse.output.metadata
output:
node_data = "results/traits_{serotype}.json",
params:
columns = "region country",
sampling_bias_correction = 3
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--output {output.node_data} \
--columns {params.columns} \
--confidence \
--sampling-bias-correction {params.sampling_bias_correction}
"""

rule clades:
message: "Annotating clades"
input:
tree = rules.refine.output.tree,
nt_muts = rules.ancestral.output,
aa_muts = rules.translate.output,
clades = files.clades
output:
clades = "results/clades_{serotype}.json"
shell:
"""
augur clades \
--tree {input.tree} \
--mutations {input.nt_muts} {input.aa_muts} \
--clades {input.clades} \
--output {output.clades}
"""

rule export:
message: "Exporting data files for for auspice"
input:
tree = rules.refine.output.tree,
metadata = rules.parse.output.metadata,
branch_lengths = rules.refine.output.node_data,
traits = rules.traits.output.node_data,
clades = rules.clades.output.clades,
nt_muts = rules.ancestral.output.node_data,
aa_muts = rules.translate.output.node_data,
auspice_config = files.auspice_config
output:
auspice_tree = "auspice/dengue_{serotype}_tree.json",
auspice_meta = "auspice/dengue_{serotype}_meta.json"
shell:
"""
augur export \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.traits} {input.clades} {input.nt_muts} {input.aa_muts} \
--auspice-config {input.auspice_config} \
--output-tree {output.auspice_tree} \
--output-meta {output.auspice_meta}
"""

rule clean:
message: "Removing directories: {params}"
params:
"results ",
"auspice"
shell:
"rm -rfv {params}"
52 changes: 52 additions & 0 deletions config/auspice_config_all.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"title": "Real-time tracking of dengue virus evolution",
"color_options": {
"gt": {
"menuItem": "genotype",
"legendTitle": "Genotype",
"type": "discrete",
"key": "genotype"
},
"num_date": {
"menuItem": "date",
"legendTitle": "Sampling date",
"type": "continuous",
"key": "num_date"
},
"country": {
"menuItem": "country",
"legendTitle": "Country",
"type": "discrete",
"key": "country"
},
"region": {
"menuItem": "region",
"legendTitle": "Region",
"type": "discrete",
"key": "region"
},
"clade_membership": {
"key": "clade_membership",
"legendTitle": "Serotype",
"menuItem": "serotype",
"type": "discrete"
}
},
"geo": [
"country",
"region"
],
"defaults": {
"mapTriplicate": true,
"colorBy": "clade_membership"
},
"maintainer": [
"Trevor Bedford",
"http://bedford.io/team/trevor-bedford/"
],
"filters": [
"country",
"region",
"authors"
]
}
Loading

0 comments on commit 3e0f9fe

Please sign in to comment.