From 6b9a688a30b1ea6908ffa8508aff7c8f1c6bf831 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Thu, 25 Apr 2024 17:09:46 -0700 Subject: [PATCH 1/8] Copy files from `phylogenetic` to `nextclade` Creates a nextclade directory and copies relevant files from phylogenetic directory to nextclade directory. Subsequent commits will change these for the Nextclade Dataset tree workflow. --- nextclade/Snakefile | 27 ++++++ nextclade/defaults/auspice_config.json | 55 ++++++++++++ nextclade/defaults/colors.tsv | 31 +++++++ nextclade/defaults/config.yaml | 29 ++++++ nextclade/defaults/dropped_strains.txt | 2 + nextclade/defaults/include_strains.txt | 40 +++++++++ .../defaults/measles_reference_N450.fasta | 8 ++ nextclade/defaults/measles_reference_N450.gb | 69 +++++++++++++++ nextclade/rules/annotate_phylogeny.smk | 41 +++++++++ nextclade/rules/construct_phylogeny.smk | 56 ++++++++++++ nextclade/rules/export.smk | 35 ++++++++ nextclade/rules/prepare_sequences.smk | 88 +++++++++++++++++++ 12 files changed, 481 insertions(+) create mode 100644 nextclade/Snakefile create mode 100644 nextclade/defaults/auspice_config.json create mode 100644 nextclade/defaults/colors.tsv create mode 100644 nextclade/defaults/config.yaml create mode 100644 nextclade/defaults/dropped_strains.txt create mode 100644 nextclade/defaults/include_strains.txt create mode 100644 nextclade/defaults/measles_reference_N450.fasta create mode 100644 nextclade/defaults/measles_reference_N450.gb create mode 100644 nextclade/rules/annotate_phylogeny.smk create mode 100644 nextclade/rules/construct_phylogeny.smk create mode 100644 nextclade/rules/export.smk create mode 100644 nextclade/rules/prepare_sequences.smk diff --git a/nextclade/Snakefile b/nextclade/Snakefile new file mode 100644 index 0000000..b4521ff --- /dev/null +++ b/nextclade/Snakefile @@ -0,0 +1,27 @@ +genes = ['N450', 'genome'] + +configfile: "defaults/config.yaml" + +rule all: + input: + auspice_json = expand("auspice/measles_{gene}.json", gene=genes) + +include: "rules/prepare_sequences.smk" +include: "rules/prepare_sequences_N450.smk" +include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" +include: "rules/export.smk" + +# Include custom rules defined in the config. +if "custom_rules" in config: + for rule_file in config["custom_rules"]: + + include: rule_file + +rule clean: + """Removing directories: {params}""" + params: + "results ", + "auspice" + shell: + "rm -rfv {params}" diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json new file mode 100644 index 0000000..d0e92c6 --- /dev/null +++ b/nextclade/defaults/auspice_config.json @@ -0,0 +1,55 @@ +{ + "title": "Real-time tracking of measles virus evolution", + "maintainers": [ + {"name": "Kim Andrews", "url": "https://bedford.io/team/kim-andrews/"}, + {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"} + ], + "build_url": "https://github.com/nextstrain/measles", + "colorings": [ + { + "key": "gt", + "title": "Genotype", + "type": "categorical" + }, + { + "key": "num_date", + "title": "Date", + "type": "continuous" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "genotype_ncbi", + "title": "Genotype (NCBI)", + "type": "categorical" + }, + { + "key": "is_reference", + "title": "WHO reference", + "type": "categorical" + } + ], + "geo_resolutions": [ + "country", + "region" + ], + "display_defaults": { + "map_triplicate": true + }, + "filters": [ + "country", + "region", + "author" + ], + "metadata_columns": [ + "author" + ] +} diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv new file mode 100644 index 0000000..548f45a --- /dev/null +++ b/nextclade/defaults/colors.tsv @@ -0,0 +1,31 @@ +region Asia #447CCD +region Oceania #5EA9A1 +region Africa #8ABB6A +region Europe #BEBB48 +region South America #E29E39 +region North America #E2562B + +genotype_ncbi A #5E1D9D +genotype_ncbi B1 #4B26B1 +genotype_ncbi B2 #4138C3 +genotype_ncbi B3 #3F4FCC +genotype_ncbi C1 #4065CF +genotype_ncbi C2 #447ACD +genotype_ncbi D1 #4A8BC3 +genotype_ncbi D2 #529AB6 +genotype_ncbi D3 #5BA6A6 +genotype_ncbi D4 #66AE95 +genotype_ncbi D5 #73B583 +genotype_ncbi D6 #81B973 +genotype_ncbi D7 #91BC64 +genotype_ncbi D8 #A1BE58 +genotype_ncbi D9 #B1BD4E +genotype_ncbi D10 #C0BA47 +genotype_ncbi D11 #CEB541 +genotype_ncbi E #DAAD3D +genotype_ncbi F #E19F3A +genotype_ncbi G1 #E68E36 +genotype_ncbi G2 #E67832 +genotype_ncbi G3 #E35F2D +genotype_ncbi H1 #DF4328 +genotype_ncbi H2 #DB2823 diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml new file mode 100644 index 0000000..aa785ee --- /dev/null +++ b/nextclade/defaults/config.yaml @@ -0,0 +1,29 @@ +strain_id_field: "accession" +files: + exclude: "defaults/dropped_strains.txt" + include_genome: "defaults/include_strains_genome.txt" + include_N450: "defaults/include_strains_N450.txt" + reference: "defaults/measles_reference.gb" + reference_N450: "defaults/measles_reference_N450.gb" + reference_N450_fasta: "defaults/measles_reference_N450.fasta" + colors: "defaults/colors.tsv" + auspice_config: "defaults/auspice_config.json" + auspice_config_N450: "defaults/auspice_config_N450.json" +filter: + group_by: "country year" + sequences_per_group: 20 + min_date: 1950 + min_length: 5000 +filter_N450: + group_by: "country year" + subsample_max_sequences: 3000 + min_date: 1950 + min_length: 400 +refine: + coalescent: "opt" + date_inference: "marginal" + clock_filter_iqd: 4 +ancestral: + inference: "joint" +export: + metadata_columns: "strain division location" diff --git a/nextclade/defaults/dropped_strains.txt b/nextclade/defaults/dropped_strains.txt new file mode 100644 index 0000000..7983808 --- /dev/null +++ b/nextclade/defaults/dropped_strains.txt @@ -0,0 +1,2 @@ +HM562901 # temara.MOR/24.03 +HM562900 # Mvs/Toulon.FRA/08.07 \ No newline at end of file diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt new file mode 100644 index 0000000..8e49e3d --- /dev/null +++ b/nextclade/defaults/include_strains.txt @@ -0,0 +1,40 @@ +# Vaccine strain information from Parks et al. Comparison of predicted amino acid +# sequences of measles virus strains in the Edmonston vaccine lineage +# https://doi.org/10.1128/jvi.75.2.910-920.2001 +AF266288 +AF266287 +AF266290 +AF266289 +AF266291 +AF266286 +# +# WHO genotype reference strains +# Information from https://www.who.int/publications/i/item/WER8709 +AF045212 +AF045217 +AF079555 +AF171232 +AF243450 +AF280803 +AF481485 +AJ232203 +AY037020 +AY043459 +AY184217 +AY923185 +D01005 +GU440571 +L46750 +L46753 +L46758 +M89921 +U01974 +U01976 +U01977 +U01987 +U01994 +U01998 +U64582 +X84865 +X84872 +X84879 diff --git a/nextclade/defaults/measles_reference_N450.fasta b/nextclade/defaults/measles_reference_N450.fasta new file mode 100644 index 0000000..eab1954 --- /dev/null +++ b/nextclade/defaults/measles_reference_N450.fasta @@ -0,0 +1,8 @@ +>lcl|NC_001498.1_cds_NP_056918.1_1 [gene=N] [locus_tag=MeVgp1] [db_xref=GeneID:1489804] [protein=nucleocapsid protein] [protein_id=NP_056918.1] [location=1233..1682] [gbkey=CDS] +GTCAGTTCCACATTGGCATCCGAACTCGGTATCACTGCCGAGGATGCAAGGCTTGTTTCAGAGAT +TGCAATGCATACTACTGAGGACAGGATCAGTAGAGCGGTCGGACCCAGACAAGCCCAAGTGTCATTTCTA +CACGGTGATCAAAGTGAGAATGAGCTACCAGGATTGGGGGGCAAGGAAGATAGGAGGGTCAAACAGGGTC +GGGGAGAAGCCAGGGAGAGCTACAGAGAAACCGGGTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCC +TCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGTCGAAGG +TCAGCTGACGCCCTGCTCAGGCTGCAAGCCATGGCAGGAATCTTGGAAGAACAAGGCTCAGACACGGACA +CCCCTAGGGTATACAATGACAGAGATCTTCTAGAC diff --git a/nextclade/defaults/measles_reference_N450.gb b/nextclade/defaults/measles_reference_N450.gb new file mode 100644 index 0000000..277c96a --- /dev/null +++ b/nextclade/defaults/measles_reference_N450.gb @@ -0,0 +1,69 @@ +LOCUS NC_001498 450 bp cRNA linear VRL 13-AUG-2018 +DEFINITION Measles virus, complete genome. +ACCESSION NC_001498 REGION: 1233..1682 +VERSION NC_001498.1 +DBLINK Project: 15025 + BioProject: PRJNA485481 +KEYWORDS RefSeq. +SOURCE Measles morbillivirus + ORGANISM Measles morbillivirus + Viruses; Riboviria; Orthornavirae; Negarnaviricota; + Haploviricotina; Monjiviricetes; Mononegavirales; Paramyxoviridae; + Orthoparamyxovirinae; Morbillivirus; Morbillivirus hominis. +REFERENCE 1 (sites) + AUTHORS Rima,B.K. and Duprex,W.P. + TITLE The measles virus replication cycle + JOURNAL Curr. Top. Microbiol. Immunol. 329, 77-102 (2009) + PUBMED 19198563 +REFERENCE 2 + AUTHORS Takeuchi,K., Miyajima,N., Kobune,F. and Tashiro,M. + TITLE Comparative nucleotide sequence analyses of the entire genomes of + B95a cell-isolated and vero cell-isolated measles viruses from the + same patient + JOURNAL Virus Genes 20 (3), 253-257 (2000) + PUBMED 10949953 +REFERENCE 3 (bases 1 to 450) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (01-AUG-2000) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 4 (bases 1 to 450) + AUTHORS Takeuchi,K., Tanabayashi,K. and Tashiro,M. + TITLE Direct Submission + JOURNAL Submitted (10-JUL-1998) Kaoru Takeuchi, National Institute of + Infectious Diseases, Viral Disease and Vaccine Contorol; 4-7-1 + Gakuen, Musashi-murayama, Tokyo 208-0011, Japan + (E-mail:ktake@nih.go.jp, Tel:81-42-561-0771(ex.530), + Fax:81-42-567-5631) +COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The + reference sequence was derived from AB016162. + Sequence updated (21-Jul-1998) + Sequence updated (11-Dec-1998). + COMPLETENESS: full length. +FEATURES Location/Qualifiers + source 1..450 + /organism="Measles morbillivirus" + /mol_type="viral cRNA" + /strain="Ichinose-B95a" + /db_xref="taxon:11234" + CDS <1..>450 + /gene="N" + /codon_start=1 + /product="nucleocapsid protein" + /protein_id="NP_056918.1" + /db_xref="GeneID:1489804" + /translation="VSSTLASELGITAEDAR + LVSEIAMHTTEDRISRAVGPRQAQVSFLHGDQSENELPGLGGKEDRRVKQGRGEARES + YRETGSSRASDARAAHPPTSMPLDIDTASESGQDPQDSRRSADALLRLQAMAGILEEQ + GSDTDTPRVYNDRDLLD" +ORIGIN + 1 gtcagttcca cattggcatc cgaactcggt atcactgccg aggatgcaag gcttgtttca + 61 gagattgcaa tgcatactac tgaggacagg atcagtagag cggtcggacc cagacaagcc + 121 caagtgtcat ttctacacgg tgatcaaagt gagaatgagc taccaggatt ggggggcaag + 181 gaagatagga gggtcaaaca gggtcgggga gaagccaggg agagctacag agaaaccggg + 241 tccagcagag caagtgatgc gagagctgcc catcctccaa ccagcatgcc cctagacatt + 301 gacactgcat cggagtcagg ccaagatccg caggacagtc gaaggtcagc tgacgccctg + 361 ctcaggctgc aagccatggc aggaatcttg gaagaacaag gctcagacac ggacacccct + 421 agggtataca atgacagaga tcttctagac +// + diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk new file mode 100644 index 0000000..61e94b8 --- /dev/null +++ b/nextclade/rules/annotate_phylogeny.smk @@ -0,0 +1,41 @@ +""" +This part of the workflow creates additonal annotations for the phylogenetic tree. + +See Augur's usage docs for these commands for more details. + +""" + +rule ancestral: + """Reconstructing ancestral sequences and mutations""" + input: + tree = "results/{gene}/tree.nwk", + alignment = "results/{gene}/aligned.fasta" + output: + node_data = "results/{gene}/nt_muts.json" + params: + inference = config["ancestral"]["inference"] + shell: + """ + augur ancestral \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --output-node-data {output.node_data} \ + --inference {params.inference} + """ + +rule translate: + """Translating amino acid sequences""" + input: + tree = "results/{gene}/tree.nwk", + node_data = "results/{gene}/nt_muts.json", + reference = lambda wildcard: "defaults/measles_reference.gb" if wildcard.gene in ["genome"] else "defaults/measles_reference_{gene}.gb" + output: + node_data = "results/{gene}/aa_muts.json" + shell: + """ + augur translate \ + --tree {input.tree} \ + --ancestral-sequences {input.node_data} \ + --reference-sequence {input.reference} \ + --output {output.node_data} \ + """ diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk new file mode 100644 index 0000000..e222800 --- /dev/null +++ b/nextclade/rules/construct_phylogeny.smk @@ -0,0 +1,56 @@ +""" +This part of the workflow constructs the phylogenetic tree. + +See Augur's usage docs for these commands for more details. +""" + +rule tree: + """Building tree""" + input: + alignment = "results/{gene}/aligned.fasta" + output: + tree = "results/{gene}/tree_raw.nwk" + shell: + """ + augur tree \ + --alignment {input.alignment} \ + --output {output.tree} + """ + +rule refine: + """ + Refining tree + - estimate timetree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation + """ + input: + tree = "results/{gene}/tree_raw.nwk", + alignment = "results/{gene}/aligned.fasta", + metadata = "data/metadata.tsv" + output: + tree = "results/{gene}/tree.nwk", + node_data = "results/{gene}/branch_lengths.json" + params: + coalescent = config["refine"]["coalescent"], + date_inference = config["refine"]["date_inference"], + clock_filter_iqd = config["refine"]["clock_filter_iqd"], + strain_id = config["strain_id_field"] + shell: + """ + augur refine \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-tree {output.tree} \ + --output-node-data {output.node_data} \ + --timetree \ + --coalescent {params.coalescent} \ + --date-confidence \ + --date-inference {params.date_inference} \ + --clock-filter-iqd {params.clock_filter_iqd} \ + --stochastic-resolve + """ + \ No newline at end of file diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk new file mode 100644 index 0000000..96dfed5 --- /dev/null +++ b/nextclade/rules/export.smk @@ -0,0 +1,35 @@ +""" +This part of the workflow collects the phylogenetic tree and annotations to +export a Nextstrain dataset. + +See Augur's usage docs for these commands for more details. +""" + +rule export: + """Exporting data files for for auspice""" + input: + tree = "results/{gene}/tree.nwk", + metadata = "data/metadata.tsv", + branch_lengths = "results/{gene}/branch_lengths.json", + nt_muts = "results/{gene}/nt_muts.json", + aa_muts = "results/{gene}/aa_muts.json", + colors = config["files"]["colors"], + auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json" + output: + auspice_json = "auspice/measles_{gene}.json" + params: + strain_id = config["strain_id_field"], + metadata_columns = config["export"]["metadata_columns"] + shell: + """ + augur export v2 \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \ + --colors {input.colors} \ + --metadata-columns {params.metadata_columns} \ + --auspice-config {input.auspice_config} \ + --include-root-sequence-inline \ + --output {output.auspice_json} + """ diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk new file mode 100644 index 0000000..508fcc7 --- /dev/null +++ b/nextclade/rules/prepare_sequences.smk @@ -0,0 +1,88 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree. + +See Augur's usage docs for these commands for more details. +""" +rule download: + """Downloading sequences and metadata from data.nextstrain.org""" + output: + sequences = "data/sequences.fasta.zst", + metadata = "data/metadata.tsv.zst" + params: + sequences_url = "https://data.nextstrain.org/files/workflows/measles/sequences.fasta.zst", + metadata_url = "https://data.nextstrain.org/files/workflows/measles/metadata.tsv.zst" + shell: + """ + curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} + curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} + """ + +rule decompress: + """Decompressing sequences and metadata""" + input: + sequences = "data/sequences.fasta.zst", + metadata = "data/metadata.tsv.zst" + output: + sequences = "data/sequences.fasta", + metadata = "data/metadata.tsv" + shell: + """ + zstd -d -c {input.sequences} > {output.sequences} + zstd -d -c {input.metadata} > {output.metadata} + """ + +rule filter: + """ + Filtering to + - {params.sequences_per_group} sequence(s) per {params.group_by!s} + - from {params.min_date} onwards + - excluding strains in {input.exclude} + - minimum genome length of {params.min_length} + """ + input: + sequences = "data/sequences.fasta", + metadata = "data/metadata.tsv", + exclude = config["files"]["exclude"], + include = config["files"]["include_genome"] + output: + sequences = "results/genome/filtered.fasta" + params: + group_by = config["filter"]["group_by"], + sequences_per_group = config["filter"]["sequences_per_group"], + min_date = config["filter"]["min_date"], + min_length = config["filter"]["min_length"], + strain_id = config["strain_id_field"] + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --exclude {input.exclude} \ + --output {output.sequences} \ + --group-by {params.group_by} \ + --sequences-per-group {params.sequences_per_group} \ + --min-date {params.min_date} \ + --min-length {params.min_length} + """ + +rule align: + """ + Aligning sequences to {input.reference} + - filling gaps with N + """ + input: + sequences = "results/genome/filtered.fasta", + reference = config["files"]["reference"] + output: + alignment = "results/genome/aligned.fasta" + shell: + """ + augur align \ + --sequences {input.sequences} \ + --reference-sequence {input.reference} \ + --output {output.alignment} \ + --fill-gaps \ + --remove-reference + """ + \ No newline at end of file From 449aec00309d72020956292560315434b810f661 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Sat, 27 Apr 2024 15:53:34 -0700 Subject: [PATCH 2/8] Prepare sequences for Nextclade Dataset tree * Add rule to align sequences to N450 reference using Nextclade, following https://github.com/nextstrain/measles/pull/20/commits/edbefd52b3222ba92cd7fb6b9bbf6f9d3b72a26a * Filter to produce a sample set appropriate for a Nextclade Dataset tree --- nextclade/Snakefile | 1 - nextclade/defaults/config.yaml | 17 +++---- nextclade/defaults/dropped_strains.txt | 31 ++++++++++++- nextclade/defaults/include_strains.txt | 13 ++++++ nextclade/rules/prepare_sequences.smk | 64 ++++++++++++-------------- 5 files changed, 79 insertions(+), 47 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index b4521ff..1bcbf15 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -7,7 +7,6 @@ rule all: auspice_json = expand("auspice/measles_{gene}.json", gene=genes) include: "rules/prepare_sequences.smk" -include: "rules/prepare_sequences_N450.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" include: "rules/export.smk" diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml index aa785ee..bb67af5 100644 --- a/nextclade/defaults/config.yaml +++ b/nextclade/defaults/config.yaml @@ -1,22 +1,17 @@ strain_id_field: "accession" files: exclude: "defaults/dropped_strains.txt" - include_genome: "defaults/include_strains_genome.txt" - include_N450: "defaults/include_strains_N450.txt" - reference: "defaults/measles_reference.gb" + include: "defaults/include_strains.txt" reference_N450: "defaults/measles_reference_N450.gb" reference_N450_fasta: "defaults/measles_reference_N450.fasta" colors: "defaults/colors.tsv" auspice_config: "defaults/auspice_config.json" - auspice_config_N450: "defaults/auspice_config_N450.json" +align_and_extract_N450: + min_length: 400 + min_seed_cover: 0.01 filter: - group_by: "country year" - sequences_per_group: 20 - min_date: 1950 - min_length: 5000 -filter_N450: - group_by: "country year" - subsample_max_sequences: 3000 + group_by: "region genotype_ncbi year" + subsample_max_sequences: 500 min_date: 1950 min_length: 400 refine: diff --git a/nextclade/defaults/dropped_strains.txt b/nextclade/defaults/dropped_strains.txt index 7983808..e48b51d 100644 --- a/nextclade/defaults/dropped_strains.txt +++ b/nextclade/defaults/dropped_strains.txt @@ -1,2 +1,31 @@ HM562901 # temara.MOR/24.03 -HM562900 # Mvs/Toulon.FRA/08.07 \ No newline at end of file +HM562900 # Mvs/Toulon.FRA/08.07 +# +# Incorrect genotypes reported on NCBI: +KY941950 +KX610825 +KX603680 +KX455930 +KX420624 +KX024471 +KP856709 +KP734120 +KP734117 +KP056769 +KM017095 +KJ556875 +KJ556859 +KC139079 +JX556685 +JX556684 +JN005809 +GU937234 +AF410987 +# +# Genotype assignment is ambiguous based on tree structure: +KY678430 +AB453044 +# +# Clock rate outliers: +AB573812 +HM562905 diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt index 8e49e3d..57c14f0 100644 --- a/nextclade/defaults/include_strains.txt +++ b/nextclade/defaults/include_strains.txt @@ -38,3 +38,16 @@ U64582 X84865 X84872 X84879 +# +# Rare genotypes +# Including these to boost representation of these genotypes in the nextclade tree +AF410989 #Rare genotype: E +MG912591 #Rare genotype: G2 +AY037009 #Rare genotype: G2 +AY037043 #Rare genotype: H2 +AY037026 #Rare genotype: H2 +AY037028 #Rare genotype: D2 +FJ668380 #Rare genotype: D10 +MN017369 #Rare genotype: D11 +KC968467 #Rare genotype: D11 +KC968354 #Rare genotype: D11 diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk index 508fcc7..428a0de 100644 --- a/nextclade/rules/prepare_sequences.smk +++ b/nextclade/rules/prepare_sequences.smk @@ -31,24 +31,39 @@ rule decompress: zstd -d -c {input.metadata} > {output.metadata} """ -rule filter: - """ - Filtering to - - {params.sequences_per_group} sequence(s) per {params.group_by!s} - - from {params.min_date} onwards - - excluding strains in {input.exclude} - - minimum genome length of {params.min_length} - """ +rule align_and_extract_N450: input: sequences = "data/sequences.fasta", + reference = config["files"]["reference_N450_fasta"] + output: + sequences = "results/sequences_N450.fasta" + params: + min_seed_cover = config['align_and_extract_N450']['min_seed_cover'], + min_length = config['align_and_extract_N450']['min_length'] + threads: workflow.cores + shell: + """ + nextclade3 run \ + --jobs {threads} \ + --input-ref {input.reference} \ + --output-fasta {output.sequences} \ + --min-seed-cover {params.min_seed_cover} \ + --min-length {params.min_length} \ + --silent \ + {input.sequences} + """ + +rule filter: + input: + sequences = "results/sequences_N450.fasta", metadata = "data/metadata.tsv", exclude = config["files"]["exclude"], - include = config["files"]["include_genome"] + include = config["files"]["include"] output: - sequences = "results/genome/filtered.fasta" + sequences = "results/aligned.fasta" params: group_by = config["filter"]["group_by"], - sequences_per_group = config["filter"]["sequences_per_group"], + subsample_max_sequences = config["filter"]["subsample_max_sequences"], min_date = config["filter"]["min_date"], min_length = config["filter"]["min_length"], strain_id = config["strain_id_field"] @@ -59,30 +74,11 @@ rule filter: --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ --exclude {input.exclude} \ + --include {input.include} \ --output {output.sequences} \ --group-by {params.group_by} \ - --sequences-per-group {params.sequences_per_group} \ + --subsample-max-sequences {params.subsample_max_sequences} \ --min-date {params.min_date} \ - --min-length {params.min_length} - """ - -rule align: - """ - Aligning sequences to {input.reference} - - filling gaps with N - """ - input: - sequences = "results/genome/filtered.fasta", - reference = config["files"]["reference"] - output: - alignment = "results/genome/aligned.fasta" - shell: - """ - augur align \ - --sequences {input.sequences} \ - --reference-sequence {input.reference} \ - --output {output.alignment} \ - --fill-gaps \ - --remove-reference + --min-length {params.min_length} \ + --query='genotype_ncbi!=""' """ - \ No newline at end of file From 5e76b3102735aa2fc1857aa16623dbc94c5acb50 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Sat, 27 Apr 2024 15:58:23 -0700 Subject: [PATCH 3/8] Construct phylogeny for Nextclade Dataset tree Remove construct_phylogeny.smk wildcards that were needed for the Phylogenetic workflow but are not needed for the Nextclade Dataset tree workflow. --- nextclade/rules/construct_phylogeny.smk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk index e222800..5a34b9e 100644 --- a/nextclade/rules/construct_phylogeny.smk +++ b/nextclade/rules/construct_phylogeny.smk @@ -7,9 +7,9 @@ See Augur's usage docs for these commands for more details. rule tree: """Building tree""" input: - alignment = "results/{gene}/aligned.fasta" + alignment = "results/aligned.fasta" output: - tree = "results/{gene}/tree_raw.nwk" + tree = "results/tree_raw.nwk" shell: """ augur tree \ @@ -26,12 +26,12 @@ rule refine: - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation """ input: - tree = "results/{gene}/tree_raw.nwk", - alignment = "results/{gene}/aligned.fasta", + tree = "results/tree_raw.nwk", + alignment = "results/aligned.fasta", metadata = "data/metadata.tsv" output: - tree = "results/{gene}/tree.nwk", - node_data = "results/{gene}/branch_lengths.json" + tree = "results/tree.nwk", + node_data = "results/branch_lengths.json" params: coalescent = config["refine"]["coalescent"], date_inference = config["refine"]["date_inference"], From df0ba96f91581b1fb2629fc476a0ef3922f1f0dd Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Sat, 27 Apr 2024 16:23:58 -0700 Subject: [PATCH 4/8] Annotate phylogeny for Nextclade Dataset tree * Use `augur ancestral` option `--root-sequence` to add mutations from the reference sequence to the root of the tree, which is required for Nextclade Dataset trees, as described here: https://github.com/nextstrain/nextclade_data/blob/577696e4ee0f0dff925d3911d27351a7395fab3d/docs/dataset-creation-guide.md * Use `augur clades` to assign genotypes to nodes * Remove annotate_phylogeny.smk wildcards that were needed for the Phylogenetic workflow but are not needed for the Nextclade Dataset tree workflow. --- nextclade/defaults/clades.tsv | 64 ++++++++++++++++++++++++++ nextclade/defaults/config.yaml | 1 + nextclade/rules/annotate_phylogeny.smk | 37 +++++++++++---- 3 files changed, 93 insertions(+), 9 deletions(-) create mode 100644 nextclade/defaults/clades.tsv diff --git a/nextclade/defaults/clades.tsv b/nextclade/defaults/clades.tsv new file mode 100644 index 0000000..2c4261b --- /dev/null +++ b/nextclade/defaults/clades.tsv @@ -0,0 +1,64 @@ +clade gene site alt +A nuc 89 A +A nuc 126 A +A nuc 422 T +A nuc 439 A +B1 nuc 36 G +B1 nuc 79 G +B1 nuc 111 T +B1 nuc 261 A +B1 nuc 279 G +B2 nuc 323 T +B3 nuc 351 C +C1 nuc 75 C +C1 nuc 246 T +C1 nuc 317 T +C1 nuc 354 A +C2 nuc 81 A +C2 nuc 118 T +D1 nuc 23 G +D1 nuc 45 C +D1 nuc 330 T +D10 nuc 123 G +D10 nuc 218 T +D10 nuc 323 G +D10 nuc 444 A +D11 nuc 54 A +D11 nuc 214 T +D2 nuc 207 G +D2 nuc 255 C +D2 nuc 15 A +D2 nuc 250 A +D3 nuc 21 C +D3 nuc 186 C +D3 nuc 275 C +D3 nuc 287 T +D4 nuc 133 A +D4 nuc 367 T +D4 nuc 416 T +D5 nuc 48 G +D5 nuc 105 C +D6 nuc 316 A +D7 nuc 339 C +D7 nuc 343 C +D8 nuc 226 A +D8 nuc 251 T +D9 nuc 216 A +E nuc 224 T +E nuc 329 A +E nuc 341 A +F nuc 222 A +F nuc 297 T +F nuc 332 T +G1 nuc 27 T +G1 nuc 177 G +G1 nuc 419 T +G2 nuc 242 A +G2 nuc 271 G +G3 nuc 3 A +H1 nuc 126 C +H1 nuc 224 A +H2 nuc 84 A +H2 nuc 171 A +H2 nuc 245 A +H2 nuc 387 T diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml index bb67af5..41895a9 100644 --- a/nextclade/defaults/config.yaml +++ b/nextclade/defaults/config.yaml @@ -4,6 +4,7 @@ files: include: "defaults/include_strains.txt" reference_N450: "defaults/measles_reference_N450.gb" reference_N450_fasta: "defaults/measles_reference_N450.fasta" + clades: "defaults/clades.tsv" colors: "defaults/colors.tsv" auspice_config: "defaults/auspice_config.json" align_and_extract_N450: diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk index 61e94b8..6ca5ec3 100644 --- a/nextclade/rules/annotate_phylogeny.smk +++ b/nextclade/rules/annotate_phylogeny.smk @@ -8,29 +8,31 @@ See Augur's usage docs for these commands for more details. rule ancestral: """Reconstructing ancestral sequences and mutations""" input: - tree = "results/{gene}/tree.nwk", - alignment = "results/{gene}/aligned.fasta" + tree = "results/tree.nwk", + alignment = "results/aligned.fasta" output: - node_data = "results/{gene}/nt_muts.json" + node_data = "results/nt_muts.json" params: - inference = config["ancestral"]["inference"] + inference = config["ancestral"]["inference"], + reference_fasta = config["files"]["reference_N450_fasta"] shell: """ augur ancestral \ --tree {input.tree} \ --alignment {input.alignment} \ --output-node-data {output.node_data} \ - --inference {params.inference} + --inference {params.inference} \ + --root-sequence {params.reference_fasta} """ rule translate: """Translating amino acid sequences""" input: - tree = "results/{gene}/tree.nwk", - node_data = "results/{gene}/nt_muts.json", - reference = lambda wildcard: "defaults/measles_reference.gb" if wildcard.gene in ["genome"] else "defaults/measles_reference_{gene}.gb" + tree = "results/tree.nwk", + node_data = "results/nt_muts.json", + reference = config["files"]["reference_N450"] output: - node_data = "results/{gene}/aa_muts.json" + node_data = "results/aa_muts.json" shell: """ augur translate \ @@ -39,3 +41,20 @@ rule translate: --reference-sequence {input.reference} \ --output {output.node_data} \ """ + +rule clades: + input: + tree = "results/tree.nwk", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + clade_defs = config["files"]["clades"] + output: + clades = "results/clades.json" + shell: + """ + augur clades \ + --tree {input.tree} \ + --mutations {input.nt_muts} {input.aa_muts} \ + --clades {input.clade_defs} \ + --output {output.clades} + """ From df7c7f044378b96e928da981754a7072c37ecd8b Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Sat, 27 Apr 2024 16:33:33 -0700 Subject: [PATCH 5/8] Export auspice json for Nextclade Dataset tree * Include `augur clades` genotype assignments in exported tree and in coloring options * Remove export.smk wildcards that were needed for the Phylogenetic workflow but are not needed for the Nextclade Dataset tree workflow. --- nextclade/Snakefile | 4 +--- nextclade/defaults/auspice_config.json | 5 +++++ nextclade/defaults/colors.tsv | 30 +++++++++++++++++++++++++- nextclade/rules/export.smk | 17 ++++++++------- phylogenetic/defaults/colors.tsv | 4 +++- 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 1bcbf15..dd876ff 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -1,10 +1,8 @@ -genes = ['N450', 'genome'] - configfile: "defaults/config.yaml" rule all: input: - auspice_json = expand("auspice/measles_{gene}.json", gene=genes) + auspice_json = "auspice/measles.json" include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json index d0e92c6..aa89017 100644 --- a/nextclade/defaults/auspice_config.json +++ b/nextclade/defaults/auspice_config.json @@ -35,6 +35,11 @@ "key": "is_reference", "title": "WHO reference", "type": "categorical" + }, + { + "key": "clade_membership", + "title": "Genotype (augur clades)", + "type": "categorical" } ], "geo_resolutions": [ diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv index 548f45a..e9fa118 100644 --- a/nextclade/defaults/colors.tsv +++ b/nextclade/defaults/colors.tsv @@ -1,10 +1,12 @@ +# Regions: These fields are identical to those in phylogenetic/defaults/colors.tsv. Any changes to one should also be made to the other. region Asia #447CCD region Oceania #5EA9A1 region Africa #8ABB6A region Europe #BEBB48 region South America #E29E39 region North America #E2562B - +# +# MeV Genotypes reported in NCBI GenBank metadata: These fields are identical to those in phylogenetic/defaults/colors.tsv. Any changes to one should also be made to the other. genotype_ncbi A #5E1D9D genotype_ncbi B1 #4B26B1 genotype_ncbi B2 #4138C3 @@ -29,3 +31,29 @@ genotype_ncbi G2 #E67832 genotype_ncbi G3 #E35F2D genotype_ncbi H1 #DF4328 genotype_ncbi H2 #DB2823 +# +# MeV Genotypes assigned by augur clades +clade_membership A #5E1D9D +clade_membership B1 #4B26B1 +clade_membership B2 #4138C3 +clade_membership B3 #3F4FCC +clade_membership C1 #4065CF +clade_membership C2 #447ACD +clade_membership D1 #4A8BC3 +clade_membership D2 #529AB6 +clade_membership D3 #5BA6A6 +clade_membership D4 #66AE95 +clade_membership D5 #73B583 +clade_membership D6 #81B973 +clade_membership D7 #91BC64 +clade_membership D8 #A1BE58 +clade_membership D9 #B1BD4E +clade_membership D10 #C0BA47 +clade_membership D11 #CEB541 +clade_membership E #DAAD3D +clade_membership F #E19F3A +clade_membership G1 #E68E36 +clade_membership G2 #E67832 +clade_membership G3 #E35F2D +clade_membership H1 #DF4328 +clade_membership H2 #DB2823 diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk index 96dfed5..45ac446 100644 --- a/nextclade/rules/export.smk +++ b/nextclade/rules/export.smk @@ -6,17 +6,18 @@ See Augur's usage docs for these commands for more details. """ rule export: - """Exporting data files for for auspice""" + """Exporting data files for auspice""" input: - tree = "results/{gene}/tree.nwk", + tree = "results/tree.nwk", metadata = "data/metadata.tsv", - branch_lengths = "results/{gene}/branch_lengths.json", - nt_muts = "results/{gene}/nt_muts.json", - aa_muts = "results/{gene}/aa_muts.json", + branch_lengths = "results/branch_lengths.json", + clades = "results/clades.json", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", colors = config["files"]["colors"], - auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json" + auspice_config = config["files"]["auspice_config"] output: - auspice_json = "auspice/measles_{gene}.json" + auspice_json = "auspice/measles.json" params: strain_id = config["strain_id_field"], metadata_columns = config["export"]["metadata_columns"] @@ -26,7 +27,7 @@ rule export: --tree {input.tree} \ --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ - --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \ + --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.clades} \ --colors {input.colors} \ --metadata-columns {params.metadata_columns} \ --auspice-config {input.auspice_config} \ diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv index 548f45a..8036510 100644 --- a/phylogenetic/defaults/colors.tsv +++ b/phylogenetic/defaults/colors.tsv @@ -1,10 +1,12 @@ +# Regions: These fields are identical to those in nextclade/defaults/colors.tsv. Any changes to one should also be made to the other. region Asia #447CCD region Oceania #5EA9A1 region Africa #8ABB6A region Europe #BEBB48 region South America #E29E39 region North America #E2562B - +# +# MeV Genotypes reported in NCBI GenBank metadata: These fields are identical to those in nextclade/defaults/colors.tsv. Any changes to one should also be made to the other. genotype_ncbi A #5E1D9D genotype_ncbi B1 #4B26B1 genotype_ncbi B2 #4138C3 From ba4c9be8b62b6be9dff60fba3226d827aa1a54e4 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:58:42 -0700 Subject: [PATCH 6/8] Add README.md --- nextclade/README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 nextclade/README.md diff --git a/nextclade/README.md b/nextclade/README.md new file mode 100644 index 0000000..79bfa4f --- /dev/null +++ b/nextclade/README.md @@ -0,0 +1,36 @@ + +# Measles Nextclade Dataset Tree + +This workflow creates a phylogenetic tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on [criteria outlined by the WHO](https://www.who.int/publications/i/item/WER8709). + +The WHO has defined 24 measles genotypes based on N gene and H gene sequences from 28 reference strains. For new measles samples, genotypes can be assigned based on genetic similarity to the reference strains at the "N450" region (a 450 bp region of the N gene). + +The tree created here includes N450 sequences for the 28 reference strains, along with other representative strains for each genotype. + +The workflow includes the following steps: +* Build a tree using samples from the `ingest` output, with the following sampling criteria: + * Exclude samples for which a genotype is NOT present on NCBI (indicated in the metadata column "genotype_ncbi") + * Force-include the following samples: + * WHO genotype reference strains + * Vaccine strains + * All available samples for genotypes that are poorly represented on NCBI (i.e., genotypes that have fewer than 10 samples on NCBI) + * Subsampling criteria: + * group_by: "region genotype_ncbi year" + * subsample_max_sequences: 500 + * min_date: 1950 + * min_length: 400 +* Assign genotypes to each sample and internal nodes of the tree with `augur clades`, using clade-defining mutations in `defaults/clades.tsv` +* Provide the following coloring options on the tree: + * WHO reference strains ("True" or "False") + * Genotype assignment from `augur clades` + * Genotype assignment reported on NCBI + +## How to create a new tree: +* Run the workflow: `nextstrain build .` +* Inspect the output tree by comparing genotype assignments from the following sources: + * WHO reference strains + * `augur clades` output + * NCBI Datasets output +* If unwanted samples are present in the tree, add them to `defaults/dropped_strains.tsv` and re-run the workflow +* If any changes are needed to the clade-defining mutations, add changes to `defaults/clades.tsv` and re-run the workflow +* Repeat as needed From 07bf7378455a706ccd34b2055988c8e1730df50f Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Wed, 1 May 2024 16:30:04 -0700 Subject: [PATCH 7/8] Update Auspice config for Nextclade tree This makes a few various updates to the Auspice config for the Nextclade tree including: 1. Making "clade_membership" default coloring. For this tree, we really care about MeV Genotype more than other colorings. 2. Call this "MeV Genotype (Nextstrain)". I think important to specify MeV Genotype to distinguish from the more common SNP-level genotype. 3. Clarify with "MeV Genotype (GenBank metadata)". The previous "(NCBI)" seemed like this could be a MeV Genotype call from an NCBI tool. 4. Order as Region then Country to follow ncov 5. Include clade_membership as filter option --- nextclade/defaults/auspice_config.json | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json index aa89017..07109d8 100644 --- a/nextclade/defaults/auspice_config.json +++ b/nextclade/defaults/auspice_config.json @@ -17,8 +17,8 @@ "type": "continuous" }, { - "key": "country", - "title": "Country", + "key": "clade_membership", + "title": "MeV Genotype (Nextstrain)", "type": "categorical" }, { @@ -26,20 +26,20 @@ "title": "Region", "type": "categorical" }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, { "key": "genotype_ncbi", - "title": "Genotype (NCBI)", + "title": "MeV Genotype (GenBank metadata)", "type": "categorical" }, { "key": "is_reference", - "title": "WHO reference", + "title": "WHO Reference", "type": "categorical" - }, - { - "key": "clade_membership", - "title": "Genotype (augur clades)", - "type": "categorical" } ], "geo_resolutions": [ @@ -47,11 +47,13 @@ "region" ], "display_defaults": { - "map_triplicate": true + "map_triplicate": true, + "color_by": "clade_membership" }, "filters": [ - "country", + "clade_membership", "region", + "country", "author" ], "metadata_columns": [ From 1fd8aec29e1cf704656018aefcc09676d4677af1 Mon Sep 17 00:00:00 2001 From: Kim Andrews <17375001+kimandrews@users.noreply.github.com> Date: Thu, 9 May 2024 13:22:40 -0700 Subject: [PATCH 8/8] Update Changelog --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 7c342b3..fbe1472 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,5 @@ # CHANGELOG +* 9 May 2024: Create a N450 tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on criteria outlined by the WHO [PR #28](https://github.com/nextstrain/measles/pull/28) * 25 April 2024: Add specific sequences and metadata to the measles trees, including WHO reference sequences, vaccine strains, and genotypes reported on NCBI [PR #26](https://github.com/nextstrain/measles/pull/26) * 10 April 2024: Add a single GH Action workflow to automate the ingest and phylogenetic workflows [PR #22](https://github.com/nextstrain/measles/pull/22) * 2 April 2024: Add nextstrain-automation build-configs for deploying the final Auspice dataset of the phylogenetic workflow [PR #21](https://github.com/nextstrain/measles/pull/21)