nextstrain · j23414 · Apr 1, 2022 · Mar 25, 2022 · tsibley · Apr 1, 2022
diff --git a/.travis.yml b/.travis.yml
@@ -7,11 +7,24 @@ python:
 before_install:
   - python3 -m pip install --upgrade pip setuptools wheel
 install:
-  - pip3 install git+https://github.com/nextstrain/cli
+  # Bootstrap Miniconda; see https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/use-conda-with-travis-ci.html#the-travis-yml-file
+  - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+  - bash miniconda.sh -b -p "$HOME/miniconda"
+  - source "$HOME/miniconda/etc/profile.d/conda.sh"
+  - hash -r
+  - conda config --set always_yes yes --set changeps1 no
+  - conda update -q conda
+  # Useful for debugging any issues with conda
+  - conda info -a
+  # Install mamba, then create and activate a "nextstrain" environment holding the CLI and build tools
+  - conda install -n base -c conda-forge mamba --yes
+  - conda activate base
+  - mamba create -n nextstrain -c conda-forge -c bioconda nextstrain-cli augur auspice nextalign snakemake git --yes
+  - conda activate nextstrain
   - nextstrain version
   - nextstrain check-setup
   - nextstrain update
 script:
   - mkdir -p data/
-  - cp -v example_data/measles.fasta data/
+  - cp -v example_data/* data/.
   - nextstrain build .
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ This is the [Nextstrain](https://nextstrain.org) build for measles virus, visibl
 The build encompasses fetching data, preparing it for analysis, doing quality
 control, performing analyses, and saving the results in a format suitable for
 visualization (with [auspice][]).  This involves running components of
-Nextstrain such as [fauna][] and [augur][].
+Nextstrain such as [augur][].
 
 All measles-specific steps and functionality for the Nextstrain pipeline should be
 housed in this repository.
@@ -42,22 +42,23 @@ Configuration takes place entirely with the `Snakefile`. This can be read top-to
 specifies its file inputs and output and also its parameters. There is little redirection and each
 rule should be able to be reasoned with on its own.
 
-
+<!--
 ### fauna / RethinkDB credentials
 
 This build starts by pulling sequences from our live [fauna][] database (a RethinkDB instance). This
 requires environment variables `RETHINK_HOST` and `RETHINK_AUTH_KEY` to be set.
+-->
 
-If you don't have access to our database, you can run the build using the
+If you can't download from data.nextstrain.org, you can run the build using the
 example data provided in this repository.  Before running the build, copy the
 example sequences into the `data/` directory like so:
 
     mkdir -p data/
-    cp example_data/measles.fasta data/
+    cp example_data/* data/.
 
 
 [Nextstrain]: https://nextstrain.org
-[fauna]: https://github.com/nextstrain/fauna
+<!-- [fauna]: https://github.com/nextstrain/fauna -->
 [augur]: https://github.com/nextstrain/augur
 [auspice]: https://github.com/nextstrain/auspice
 [snakemake cli]: https://snakemake.readthedocs.io/en/stable/executable.html#all-options

diff --git a/Snakefile b/Snakefile
@@ -13,40 +13,31 @@ rule files:
 files = rules.files.params
 
 rule download:
-    message: "Downloading sequences from fauna"
+    message: "Downloading sequences and metadata from data.nextstrain.org"
     output:
-        sequences = "data/measles.fasta"
+        sequences = "data/sequences.fasta.xz",  # xz archive, unpacked by rule decompress
+        metadata = "data/metadata.tsv.gz"  # gzip archive, unpacked by rule decompress
     params:
-        fasta_fields = "strain virus accession collection_date region country division location source locus authors url title journal puburl"
+        sequences_url = "https://data.nextstrain.org/files/measles/sequences.fasta.xz",  # the :q flag below shell-quotes the URL
+        metadata_url = "https://data.nextstrain.org/files/measles/metadata.tsv.gz"  # NOTE(review): confirm --compressed never decodes this .gz in transit
     shell:
         """
-        python3 ../fauna/vdb/download.py \
-            --database vdb \
-            --virus measles \
-            --fasta_fields {params.fasta_fields} \
-            --resolve_method choose_genbank \
-            --path $(dirname {output.sequences}) \
-            --fstem $(basename {output.sequences} .fasta)
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
         """
 
-rule parse:
-    message: "Parsing fasta into sequences and metadata"
+rule decompress:
+    message: "Decompressing sequences and metadata"
     input:
-        sequences = rules.download.output.sequences
+        sequences = rules.download.output.sequences,  # archives produced by rule download
+        metadata = rules.download.output.metadata
     output:
-        sequences = "results/sequences.fasta",
-        metadata = "results/metadata.tsv"
-    params:
-        fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url",
-        prettify_fields = "region country division city"
+        sequences = "data/sequences.fasta",  # written through stdout redirection in the shell block
+        metadata = "data/metadata.tsv"  # the compressed inputs are left in place
     shell:
         """
-        augur parse \
-            --sequences {input.sequences} \
-            --output-sequences {output.sequences} \
-            --output-metadata {output.metadata} \
-            --fields {params.fasta_fields} \
-            --prettify-fields {params.prettify_fields}
+        gzip --decompress --stdout {input.metadata} > {output.metadata}
+        xz --decompress --stdout {input.sequences} > {output.sequences}
         """
 
 rule filter:
@@ -59,8 +50,8 @@ rule filter:
           - minimum genome length of {params.min_length}
         """
     input:
-        sequences = rules.parse.output.sequences,
-        metadata = rules.parse.output.metadata,
+        sequences = rules.decompress.output.sequences,
+        metadata = rules.decompress.output.metadata,
         exclude = files.dropped_strains
     output:
         sequences = "results/filtered.fasta"
@@ -128,7 +119,7 @@ rule refine:
     input:
         tree = rules.tree.output.tree,
         alignment = rules.align.output,
-        metadata = rules.parse.output.metadata
+        metadata = rules.decompress.output.metadata
     output:
         tree = "results/tree.nwk",
         node_data = "results/branch_lengths.json"
@@ -190,7 +181,7 @@ rule export:
     message: "Exporting data files for for auspice"
     input:
         tree = rules.refine.output.tree,
-        metadata = rules.parse.output.metadata,
+        metadata = rules.decompress.output.metadata,
         branch_lengths = rules.refine.output.node_data,
         nt_muts = rules.ancestral.output.node_data,
         aa_muts = rules.translate.output.node_data,

diff --git a/example_data/metadata.tsv.gz b/example_data/metadata.tsv.gz
diff --git a/example_data/sequences.fasta.xz b/example_data/sequences.fasta.xz