Initial commit including snakemake and config

nextstrain · Dec 31, 2018 · 3e0f9fe · 3e0f9fe
commit 3e0f9fe
Show file tree

Hide file tree

Showing 14 changed files with 2,022 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,41 @@
+# Files created by the pipeline, which we want to keep out of git
+# (or at least out of _this_ git repo).
+data/
+results/
+auspice/
+build/
+
+# Sensitive environment variables
+environment*
+
+# Snakemake state dir
+/.snakemake
+
+# Local config overrides
+/config_local.yaml
+
+# For Python #
+##############
+*.pyc
+.tox/
+.cache/
+
+# Compiled source #
+###################
+*.com
+*.class
+*.dll
+*.exe
+*.o
+*.so
+
+# OS generated files #
+######################
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+Icon?
+ehthumbs.db
+Thumbs.db
diff --git a/Snakefile b/Snakefile
@@ -0,0 +1,283 @@
+serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']
+
+rule all:
+    input:
+        auspice_tree = expand("auspice/dengue_{serotype}_tree.json", serotype=serotypes),
+        auspice_meta = expand("auspice/dengue_{serotype}_meta.json", serotype=serotypes),
+
+rule files:
+    params:
+        dropped_strains = "config/dropped_strains.txt",
+        reference = "config/reference_dengue_{serotype}.gb",
+        clades = "config/clades.tsv",
+        auspice_config = "config/auspice_config_{serotype}.json"
+
+files = rules.files.params
+
+def serotype_integer(w):
+    serotypes = {'denv1': '1', 'denv2': '2', 'denv3': '3', 'denv4': '4'}
+    return serotypes[w.serotype]
+
+rule download:
+    message: "Downloading sequences from fauna"
+    output:
+        sequences = "data/dengue_{serotype}.fasta"
+    params:
+        fasta_fields = "strain virus accession collection_date region country division location source locus authors url title journal puburl",
+        serotype_integer = serotype_integer
+    shell:
+        """
+        python3 ../fauna/vdb/download.py \
+            --database vdb \
+            --virus dengue \
+            --fasta_fields {params.fasta_fields} \
+            --select serotype:{params.serotype_integer} \
+            --path $(dirname {output.sequences}) \
+            --fstem $(basename {output.sequences} .fasta)
+        """
+
+rule concat:
+    message: "Concatenating serotype sequences"
+    input:
+        denv1_sequences = "data/dengue_denv1.fasta",
+        denv2_sequences = "data/dengue_denv2.fasta",
+        denv3_sequences = "data/dengue_denv3.fasta",
+        denv4_sequences = "data/dengue_denv4.fasta"
+    output:
+        sequences = "data/dengue_all.fasta"
+    shell:
+        """
+        cat {input.denv1_sequences} \
+            {input.denv2_sequences} \
+            {input.denv3_sequences} \
+            {input.denv4_sequences} \
+            > {output.sequences}
+        """
+
+rule parse:
+    message: "Parsing fasta into sequences and metadata"
+    input:
+        sequences = rules.download.output.sequences
+    output:
+        sequences = "results/sequences_{serotype}.fasta",
+        metadata = "results/metadata_{serotype}.tsv"
+    params:
+        fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url"
+    shell:
+        """
+        augur parse \
+            --sequences {input.sequences} \
+            --output-sequences {output.sequences} \
+            --output-metadata {output.metadata} \
+            --fields {params.fasta_fields}
+        """
+
+rule filter:
+    message:
+        """
+        Filtering to
+          - {params.sequences_per_group} sequence(s) per {params.group_by!s}
+          - excluding strains in {input.exclude}
+          - minimum genome length of {params.min_length}
+        """
+    input:
+        sequences = rules.parse.output.sequences,
+        metadata = rules.parse.output.metadata,
+        exclude = files.dropped_strains
+    output:
+        sequences = "results/filtered_{serotype}.fasta"
+    params:
+        group_by = "year",
+        sequences_per_group = 30,
+        min_length = 5000
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --exclude {input.exclude} \
+            --output {output.sequences} \
+            --group-by {params.group_by} \
+            --sequences-per-group {params.sequences_per_group} \
+            --min-length {params.min_length}
+        """
+
+rule align:
+    message:
+        """
+        Aligning sequences to {input.reference}
+          - filling gaps with N
+        """
+    input:
+        sequences = rules.filter.output.sequences,
+        reference = files.reference
+    output:
+        alignment = "results/aligned_{serotype}.fasta"
+    shell:
+        """
+        augur align \
+            --sequences {input.sequences} \
+            --reference-sequence {input.reference} \
+            --output {output.alignment} \
+            --fill-gaps \
+            --remove-reference
+        """
+
+rule tree:
+    message: "Building tree"
+    input:
+        alignment = rules.align.output.alignment
+    output:
+        tree = "results/tree-raw_{serotype}.nwk"
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree}
+        """
+
+rule refine:
+    message:
+        """
+        Refining tree
+          - estimate timetree
+          - use {params.coalescent} coalescent timescale
+          - estimate {params.date_inference} node dates
+          - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+        """
+    input:
+        tree = rules.tree.output.tree,
+        alignment = rules.align.output,
+        metadata = rules.parse.output.metadata
+    output:
+        tree = "results/tree_{serotype}.nwk",
+        node_data = "results/branch-lengths_{serotype}.json"
+    params:
+        coalescent = "const",
+        date_inference = "marginal",
+        clock_filter_iqd = 4
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.metadata} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --timetree \
+            --coalescent {params.coalescent} \
+            --date-confidence \
+            --date-inference {params.date_inference} \
+            --clock-filter-iqd {params.clock_filter_iqd}
+        """
+
+rule ancestral:
+    message: "Reconstructing ancestral sequences and mutations"
+    input:
+        tree = rules.refine.output.tree,
+        alignment = rules.align.output
+    output:
+        node_data = "results/nt-muts_{serotype}.json"
+    params:
+        inference = "joint"
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output {output.node_data} \
+            --inference {params.inference}
+        """
+
+rule translate:
+    message: "Translating amino acid sequences"
+    input:
+        tree = rules.refine.output.tree,
+        node_data = rules.ancestral.output.node_data,
+        reference = files.reference
+    output:
+        node_data = "results/aa-muts_{serotype}.json"
+    shell:
+        """
+        augur translate \
+            --tree {input.tree} \
+            --ancestral-sequences {input.node_data} \
+            --reference-sequence {input.reference} \
+            --output {output.node_data} \
+        """
+
+rule traits:
+    message:
+        """
+        Inferring ancestral traits for {params.columns!s}
+          - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias
+        """
+    input:
+        tree = rules.refine.output.tree,
+        metadata = rules.parse.output.metadata
+    output:
+        node_data = "results/traits_{serotype}.json",
+    params:
+        columns = "region country",
+        sampling_bias_correction = 3
+    shell:
+        """
+        augur traits \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --output {output.node_data} \
+            --columns {params.columns} \
+            --confidence \
+            --sampling-bias-correction {params.sampling_bias_correction}
+        """
+
+rule clades:
+    message: "Annotating clades"
+    input:
+        tree = rules.refine.output.tree,
+        nt_muts = rules.ancestral.output,
+        aa_muts = rules.translate.output,
+        clades = files.clades
+    output:
+        clades = "results/clades_{serotype}.json"
+    shell:
+        """
+        augur clades \
+            --tree {input.tree} \
+            --mutations {input.nt_muts} {input.aa_muts} \
+            --clades {input.clades} \
+            --output {output.clades}
+        """
+
+rule export:
+    message: "Exporting data files for for auspice"
+    input:
+        tree = rules.refine.output.tree,
+        metadata = rules.parse.output.metadata,
+        branch_lengths = rules.refine.output.node_data,
+        traits = rules.traits.output.node_data,
+        clades = rules.clades.output.clades,
+        nt_muts = rules.ancestral.output.node_data,
+        aa_muts = rules.translate.output.node_data,
+        auspice_config = files.auspice_config
+    output:
+        auspice_tree = "auspice/dengue_{serotype}_tree.json",
+        auspice_meta = "auspice/dengue_{serotype}_meta.json"
+    shell:
+        """
+        augur export \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --node-data {input.branch_lengths} {input.traits} {input.clades} {input.nt_muts} {input.aa_muts} \
+            --auspice-config {input.auspice_config} \
+            --output-tree {output.auspice_tree} \
+            --output-meta {output.auspice_meta}
+        """
+
+rule clean:
+    message: "Removing directories: {params}"
+    params:
+        "results ",
+        "auspice"
+    shell:
+        "rm -rfv {params}"
diff --git a/config/auspice_config_all.json b/config/auspice_config_all.json
@@ -0,0 +1,52 @@
+{
+  "title": "Real-time tracking of dengue virus evolution",
+  "color_options": {
+    "gt": {
+      "menuItem": "genotype",
+      "legendTitle": "Genotype",
+      "type": "discrete",
+      "key": "genotype"
+    },
+    "num_date": {
+      "menuItem": "date",
+      "legendTitle": "Sampling date",
+      "type": "continuous",
+      "key": "num_date"
+    },
+    "country": {
+      "menuItem": "country",
+      "legendTitle": "Country",
+      "type": "discrete",
+      "key": "country"
+    },
+    "region": {
+      "menuItem": "region",
+      "legendTitle": "Region",
+      "type": "discrete",
+      "key": "region"
+    },
+    "clade_membership": {
+      "key": "clade_membership",
+      "legendTitle": "Serotype",
+      "menuItem": "serotype",
+      "type": "discrete"
+    }
+  },
+  "geo": [
+    "country",
+    "region"
+  ],
+  "defaults": {
+    "mapTriplicate": true,
+    "colorBy": "clade_membership"
+  },
+  "maintainer": [
+    "Trevor Bedford",
+    "http://bedford.io/team/trevor-bedford/"
+  ],
+  "filters": [
+    "country",
+    "region",
+    "authors"
+  ]
+}