-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit including snakemake and config
- Loading branch information
0 parents
commit 3e0f9fe
Showing
14 changed files
with
2,022 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Files created by the pipeline, which we want to keep out of git | ||
# (or at least out of _this_ git repo). | ||
data/ | ||
results/ | ||
auspice/ | ||
build/ | ||
|
||
# Sensitive environment variables | ||
environment* | ||
|
||
# Snakemake state dir | ||
/.snakemake | ||
|
||
# Local config overrides | ||
/config_local.yaml | ||
|
||
# For Python # | ||
############## | ||
*.pyc | ||
.tox/ | ||
.cache/ | ||
|
||
# Compiled source # | ||
################### | ||
*.com | ||
*.class | ||
*.dll | ||
*.exe | ||
*.o | ||
*.so | ||
|
||
# OS generated files # | ||
###################### | ||
.DS_Store | ||
.DS_Store? | ||
._* | ||
.Spotlight-V100 | ||
.Trashes | ||
Icon? | ||
ehthumbs.db | ||
Thumbs.db |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,283 @@ | ||
serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] | ||
|
||
rule all: | ||
input: | ||
auspice_tree = expand("auspice/dengue_{serotype}_tree.json", serotype=serotypes), | ||
auspice_meta = expand("auspice/dengue_{serotype}_meta.json", serotype=serotypes), | ||
|
||
rule files: | ||
params: | ||
dropped_strains = "config/dropped_strains.txt", | ||
reference = "config/reference_dengue_{serotype}.gb", | ||
clades = "config/clades.tsv", | ||
auspice_config = "config/auspice_config_{serotype}.json" | ||
|
||
files = rules.files.params | ||
|
||
def serotype_integer(w): | ||
serotypes = {'denv1': '1', 'denv2': '2', 'denv3': '3', 'denv4': '4'} | ||
return serotypes[w.serotype] | ||
|
||
rule download: | ||
message: "Downloading sequences from fauna" | ||
output: | ||
sequences = "data/dengue_{serotype}.fasta" | ||
params: | ||
fasta_fields = "strain virus accession collection_date region country division location source locus authors url title journal puburl", | ||
serotype_integer = serotype_integer | ||
shell: | ||
""" | ||
python3 ../fauna/vdb/download.py \ | ||
--database vdb \ | ||
--virus dengue \ | ||
--fasta_fields {params.fasta_fields} \ | ||
--select serotype:{params.serotype_integer} \ | ||
--path $(dirname {output.sequences}) \ | ||
--fstem $(basename {output.sequences} .fasta) | ||
""" | ||
|
||
rule concat: | ||
message: "Concatenating serotype sequences" | ||
input: | ||
denv1_sequences = "data/dengue_denv1.fasta", | ||
denv2_sequences = "data/dengue_denv2.fasta", | ||
denv3_sequences = "data/dengue_denv3.fasta", | ||
denv4_sequences = "data/dengue_denv4.fasta" | ||
output: | ||
sequences = "data/dengue_all.fasta" | ||
shell: | ||
""" | ||
cat {input.denv1_sequences} \ | ||
{input.denv2_sequences} \ | ||
{input.denv3_sequences} \ | ||
{input.denv4_sequences} \ | ||
> {output.sequences} | ||
""" | ||
|
||
rule parse: | ||
message: "Parsing fasta into sequences and metadata" | ||
input: | ||
sequences = rules.download.output.sequences | ||
output: | ||
sequences = "results/sequences_{serotype}.fasta", | ||
metadata = "results/metadata_{serotype}.tsv" | ||
params: | ||
fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url" | ||
shell: | ||
""" | ||
augur parse \ | ||
--sequences {input.sequences} \ | ||
--output-sequences {output.sequences} \ | ||
--output-metadata {output.metadata} \ | ||
--fields {params.fasta_fields} | ||
""" | ||
|
||
rule filter: | ||
message: | ||
""" | ||
Filtering to | ||
- {params.sequences_per_group} sequence(s) per {params.group_by!s} | ||
- excluding strains in {input.exclude} | ||
- minimum genome length of {params.min_length} | ||
""" | ||
input: | ||
sequences = rules.parse.output.sequences, | ||
metadata = rules.parse.output.metadata, | ||
exclude = files.dropped_strains | ||
output: | ||
sequences = "results/filtered_{serotype}.fasta" | ||
params: | ||
group_by = "year", | ||
sequences_per_group = 30, | ||
min_length = 5000 | ||
shell: | ||
""" | ||
augur filter \ | ||
--sequences {input.sequences} \ | ||
--metadata {input.metadata} \ | ||
--exclude {input.exclude} \ | ||
--output {output.sequences} \ | ||
--group-by {params.group_by} \ | ||
--sequences-per-group {params.sequences_per_group} \ | ||
--min-length {params.min_length} | ||
""" | ||
|
||
rule align: | ||
message: | ||
""" | ||
Aligning sequences to {input.reference} | ||
- filling gaps with N | ||
""" | ||
input: | ||
sequences = rules.filter.output.sequences, | ||
reference = files.reference | ||
output: | ||
alignment = "results/aligned_{serotype}.fasta" | ||
shell: | ||
""" | ||
augur align \ | ||
--sequences {input.sequences} \ | ||
--reference-sequence {input.reference} \ | ||
--output {output.alignment} \ | ||
--fill-gaps \ | ||
--remove-reference | ||
""" | ||
|
||
rule tree: | ||
message: "Building tree" | ||
input: | ||
alignment = rules.align.output.alignment | ||
output: | ||
tree = "results/tree-raw_{serotype}.nwk" | ||
shell: | ||
""" | ||
augur tree \ | ||
--alignment {input.alignment} \ | ||
--output {output.tree} | ||
""" | ||
|
||
rule refine: | ||
message: | ||
""" | ||
Refining tree | ||
- estimate timetree | ||
- use {params.coalescent} coalescent timescale | ||
- estimate {params.date_inference} node dates | ||
- filter tips more than {params.clock_filter_iqd} IQDs from clock expectation | ||
""" | ||
input: | ||
tree = rules.tree.output.tree, | ||
alignment = rules.align.output, | ||
metadata = rules.parse.output.metadata | ||
output: | ||
tree = "results/tree_{serotype}.nwk", | ||
node_data = "results/branch-lengths_{serotype}.json" | ||
params: | ||
coalescent = "const", | ||
date_inference = "marginal", | ||
clock_filter_iqd = 4 | ||
shell: | ||
""" | ||
augur refine \ | ||
--tree {input.tree} \ | ||
--alignment {input.alignment} \ | ||
--metadata {input.metadata} \ | ||
--output-tree {output.tree} \ | ||
--output-node-data {output.node_data} \ | ||
--timetree \ | ||
--coalescent {params.coalescent} \ | ||
--date-confidence \ | ||
--date-inference {params.date_inference} \ | ||
--clock-filter-iqd {params.clock_filter_iqd} | ||
""" | ||
|
||
rule ancestral: | ||
message: "Reconstructing ancestral sequences and mutations" | ||
input: | ||
tree = rules.refine.output.tree, | ||
alignment = rules.align.output | ||
output: | ||
node_data = "results/nt-muts_{serotype}.json" | ||
params: | ||
inference = "joint" | ||
shell: | ||
""" | ||
augur ancestral \ | ||
--tree {input.tree} \ | ||
--alignment {input.alignment} \ | ||
--output {output.node_data} \ | ||
--inference {params.inference} | ||
""" | ||
|
||
rule translate: | ||
message: "Translating amino acid sequences" | ||
input: | ||
tree = rules.refine.output.tree, | ||
node_data = rules.ancestral.output.node_data, | ||
reference = files.reference | ||
output: | ||
node_data = "results/aa-muts_{serotype}.json" | ||
shell: | ||
""" | ||
augur translate \ | ||
--tree {input.tree} \ | ||
--ancestral-sequences {input.node_data} \ | ||
--reference-sequence {input.reference} \ | ||
--output {output.node_data} \ | ||
""" | ||
|
||
rule traits: | ||
message: | ||
""" | ||
Inferring ancestral traits for {params.columns!s} | ||
- increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias | ||
""" | ||
input: | ||
tree = rules.refine.output.tree, | ||
metadata = rules.parse.output.metadata | ||
output: | ||
node_data = "results/traits_{serotype}.json", | ||
params: | ||
columns = "region country", | ||
sampling_bias_correction = 3 | ||
shell: | ||
""" | ||
augur traits \ | ||
--tree {input.tree} \ | ||
--metadata {input.metadata} \ | ||
--output {output.node_data} \ | ||
--columns {params.columns} \ | ||
--confidence \ | ||
--sampling-bias-correction {params.sampling_bias_correction} | ||
""" | ||
|
||
rule clades: | ||
message: "Annotating clades" | ||
input: | ||
tree = rules.refine.output.tree, | ||
nt_muts = rules.ancestral.output, | ||
aa_muts = rules.translate.output, | ||
clades = files.clades | ||
output: | ||
clades = "results/clades_{serotype}.json" | ||
shell: | ||
""" | ||
augur clades \ | ||
--tree {input.tree} \ | ||
--mutations {input.nt_muts} {input.aa_muts} \ | ||
--clades {input.clades} \ | ||
--output {output.clades} | ||
""" | ||
|
||
rule export: | ||
message: "Exporting data files for for auspice" | ||
input: | ||
tree = rules.refine.output.tree, | ||
metadata = rules.parse.output.metadata, | ||
branch_lengths = rules.refine.output.node_data, | ||
traits = rules.traits.output.node_data, | ||
clades = rules.clades.output.clades, | ||
nt_muts = rules.ancestral.output.node_data, | ||
aa_muts = rules.translate.output.node_data, | ||
auspice_config = files.auspice_config | ||
output: | ||
auspice_tree = "auspice/dengue_{serotype}_tree.json", | ||
auspice_meta = "auspice/dengue_{serotype}_meta.json" | ||
shell: | ||
""" | ||
augur export \ | ||
--tree {input.tree} \ | ||
--metadata {input.metadata} \ | ||
--node-data {input.branch_lengths} {input.traits} {input.clades} {input.nt_muts} {input.aa_muts} \ | ||
--auspice-config {input.auspice_config} \ | ||
--output-tree {output.auspice_tree} \ | ||
--output-meta {output.auspice_meta} | ||
""" | ||
|
||
rule clean: | ||
message: "Removing directories: {params}" | ||
params: | ||
"results ", | ||
"auspice" | ||
shell: | ||
"rm -rfv {params}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
{ | ||
"title": "Real-time tracking of dengue virus evolution", | ||
"color_options": { | ||
"gt": { | ||
"menuItem": "genotype", | ||
"legendTitle": "Genotype", | ||
"type": "discrete", | ||
"key": "genotype" | ||
}, | ||
"num_date": { | ||
"menuItem": "date", | ||
"legendTitle": "Sampling date", | ||
"type": "continuous", | ||
"key": "num_date" | ||
}, | ||
"country": { | ||
"menuItem": "country", | ||
"legendTitle": "Country", | ||
"type": "discrete", | ||
"key": "country" | ||
}, | ||
"region": { | ||
"menuItem": "region", | ||
"legendTitle": "Region", | ||
"type": "discrete", | ||
"key": "region" | ||
}, | ||
"clade_membership": { | ||
"key": "clade_membership", | ||
"legendTitle": "Serotype", | ||
"menuItem": "serotype", | ||
"type": "discrete" | ||
} | ||
}, | ||
"geo": [ | ||
"country", | ||
"region" | ||
], | ||
"defaults": { | ||
"mapTriplicate": true, | ||
"colorBy": "clade_membership" | ||
}, | ||
"maintainer": [ | ||
"Trevor Bedford", | ||
"http://bedford.io/team/trevor-bedford/" | ||
], | ||
"filters": [ | ||
"country", | ||
"region", | ||
"authors" | ||
] | ||
} |
Oops, something went wrong.