# Patch summary (reviewer notes placed ahead of the first "diff --git" header, outside every hunk):
# - .travis.yml: installs Miniconda, then uses mamba to create a "nextstrain" env (nextstrain-cli, augur, auspice, nextalign, snakemake, git) instead of pip-installing nextstrain/cli; the script step copies everything in example_data/ into data/.
# - README.md: drops the fauna references; the example-data instructions copy example_data/* into data/.
# - Snakefile: rule "download" fetches sequences.fasta.xz and metadata.tsv.gz from data.nextstrain.org with curl; rule "parse" is replaced by rule "decompress"; filter/refine/export read the rules.decompress outputs.
# - example_data/: adds metadata.tsv.gz and sequences.fasta.xz (binary; their contents are not carried by this patch).
# - Review amendments touch added ("+") lines only, so every hunk's line counts are unchanged: decompress writes via --stdout to its declared outputs and takes its inputs from rules.download.output; the Miniconda installer URL uses repo.anaconda.com.
# - The collapsed line structure is kept exactly as received; the "index" hashes are those of the original patch.
diff --git a/.travis.yml b/.travis.yml index 013f76e..121022c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,11 +7,24 @@ python: before_install: - python3 -m pip install --upgrade pip setuptools wheel install: - - pip3 install git+https://github.com/nextstrain/cli + # https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/use-conda-with-travis-ci.html#the-travis-yml-file + - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + - bash miniconda.sh -b -p $HOME/miniconda + - source "$HOME/miniconda/etc/profile.d/conda.sh" + - hash -r + - conda config --set always_yes yes --set changeps1 no + - conda update -q conda + # Useful for debugging any issues with conda + - conda info -a + # Install nextstrain cli + - conda install -n base -c conda-forge mamba --yes + - conda activate base + - mamba create -n nextstrain -c conda-forge -c bioconda nextstrain-cli augur auspice nextalign snakemake git --yes + - conda activate nextstrain - nextstrain version - nextstrain check-setup - nextstrain update script: - mkdir -p data/ - - cp -v example_data/measles.fasta data/ + - cp -v example_data/* data/. - nextstrain build . diff --git a/README.md b/README.md index f74625c..e930b31 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ This is the [Nextstrain](https://nextstrain.org) build for measles virus, visibl The build encompasses fetching data, preparing it for analysis, doing quality control, performing analyses, and saving the results in a format suitable for visualization (with [auspice][]). This involves running components of -Nextstrain such as [fauna][] and [augur][]. +Nextstrain such as [augur][]. All measles-specific steps and functionality for the Nextstrain pipeline should be housed in this repository. @@ -42,22 +42,23 @@ Configuration takes place entirely with the `Snakefile`. This can be read top-to specifies its file inputs and output and also its parameters. 
There is little redirection and each rule should be able to be reasoned with on its own. - + -If you don't have access to our database, you can run the build using the +If you don't have access to our HTTPS endpoints, you can run the build using the example data provided in this repository. Before running the build, copy the example sequences into the `data/` directory like so: mkdir -p data/ - cp example_data/measles.fasta data/ + cp example_data/* data/. [Nextstrain]: https://nextstrain.org -[fauna]: https://github.com/nextstrain/fauna + [augur]: https://github.com/nextstrain/augur [auspice]: https://github.com/nextstrain/auspice [snakemake cli]: https://snakemake.readthedocs.io/en/stable/executable.html#all-options diff --git a/Snakefile b/Snakefile index 621b990..87b9b6d 100644 --- a/Snakefile +++ b/Snakefile @@ -13,40 +13,31 @@ rule files: files = rules.files.params rule download: - message: "Downloading sequences from fauna" + message: "Downloading sequences and metadata from data.nextstrain.org" output: - sequences = "data/measles.fasta" + sequences = "data/sequences.fasta.xz", + metadata = "data/metadata.tsv.gz" params: - fasta_fields = "strain virus accession collection_date region country division location source locus authors url title journal puburl" + sequences_url = "https://data.nextstrain.org/files/measles/sequences.fasta.xz", + metadata_url = "https://data.nextstrain.org/files/measles/metadata.tsv.gz" shell: """ - python3 ../fauna/vdb/download.py \ - --database vdb \ - --virus measles \ - --fasta_fields {params.fasta_fields} \ - --resolve_method choose_genbank \ - --path $(dirname {output.sequences}) \ - --fstem $(basename {output.sequences} .fasta) + curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} + curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} """ -rule parse: - message: "Parsing fasta into sequences and metadata" +rule decompress: + message: "Decompressing sequences and metadata" input: - 
sequences = rules.download.output.sequences + sequences = rules.download.output.sequences, + metadata = rules.download.output.metadata output: - sequences = "results/sequences.fasta", - metadata = "results/metadata.tsv" - params: - fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url", - prettify_fields = "region country division city" + sequences = "data/sequences.fasta", + metadata = "data/metadata.tsv" shell: """ - augur parse \ - --sequences {input.sequences} \ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} \ - --fields {params.fasta_fields} \ - --prettify-fields {params.prettify_fields} + gzip --decompress --stdout {input.metadata} > {output.metadata} + xz --decompress --stdout {input.sequences} > {output.sequences} """ rule filter: @@ -59,8 +50,8 @@ rule filter: - minimum genome length of {params.min_length} """ input: - sequences = rules.parse.output.sequences, - metadata = rules.parse.output.metadata, + sequences = rules.decompress.output.sequences, + metadata = rules.decompress.output.metadata, exclude = files.dropped_strains output: sequences = "results/filtered.fasta" @@ -128,7 +119,7 @@ rule refine: input: tree = rules.tree.output.tree, alignment = rules.align.output, - metadata = rules.parse.output.metadata + metadata = rules.decompress.output.metadata output: tree = "results/tree.nwk", node_data = "results/branch_lengths.json" @@ -190,7 +181,7 @@ rule export: message: "Exporting data files for for auspice" input: tree = rules.refine.output.tree, - metadata = rules.parse.output.metadata, + metadata = rules.decompress.output.metadata, branch_lengths = rules.refine.output.node_data, nt_muts = rules.ancestral.output.node_data, aa_muts = rules.translate.output.node_data, diff --git a/example_data/metadata.tsv.gz b/example_data/metadata.tsv.gz new file mode 100644 index 0000000..044d947 Binary files /dev/null and b/example_data/metadata.tsv.gz differ diff --git a/example_data/sequences.fasta.xz 
b/example_data/sequences.fasta.xz new file mode 100644 index 0000000..e17c76c Binary files /dev/null and b/example_data/sequences.fasta.xz differ