From 6b9a688a30b1ea6908ffa8508aff7c8f1c6bf831 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Thu, 25 Apr 2024 17:09:46 -0700
Subject: [PATCH 1/8] Copy files from `phylogenetic` to `nextclade`

Creates a nextclade directory and copies relevant files from phylogenetic
directory to nextclade directory.
Subsequent commits will change these for the Nextclade Dataset tree workflow.
---
 nextclade/Snakefile                           | 27 ++++++
 nextclade/defaults/auspice_config.json        | 55 ++++++++++++
 nextclade/defaults/colors.tsv                 | 31 +++++++
 nextclade/defaults/config.yaml                | 29 ++++++
 nextclade/defaults/dropped_strains.txt        |  2 +
 nextclade/defaults/include_strains.txt        | 40 +++++++++
 .../defaults/measles_reference_N450.fasta     |  8 ++
 nextclade/defaults/measles_reference_N450.gb  | 69 +++++++++++++++
 nextclade/rules/annotate_phylogeny.smk        | 41 +++++++++
 nextclade/rules/construct_phylogeny.smk       | 56 ++++++++++++
 nextclade/rules/export.smk                    | 35 ++++++++
 nextclade/rules/prepare_sequences.smk         | 88 +++++++++++++++++++
 12 files changed, 481 insertions(+)
 create mode 100644 nextclade/Snakefile
 create mode 100644 nextclade/defaults/auspice_config.json
 create mode 100644 nextclade/defaults/colors.tsv
 create mode 100644 nextclade/defaults/config.yaml
 create mode 100644 nextclade/defaults/dropped_strains.txt
 create mode 100644 nextclade/defaults/include_strains.txt
 create mode 100644 nextclade/defaults/measles_reference_N450.fasta
 create mode 100644 nextclade/defaults/measles_reference_N450.gb
 create mode 100644 nextclade/rules/annotate_phylogeny.smk
 create mode 100644 nextclade/rules/construct_phylogeny.smk
 create mode 100644 nextclade/rules/export.smk
 create mode 100644 nextclade/rules/prepare_sequences.smk

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
new file mode 100644
index 0000000..b4521ff
--- /dev/null
+++ b/nextclade/Snakefile
@@ -0,0 +1,27 @@
+genes = ['N450', 'genome']
+
+configfile: "defaults/config.yaml" 
+
+rule all:
+    input:
+        auspice_json = expand("auspice/measles_{gene}.json", gene=genes)
+
+include: "rules/prepare_sequences.smk"
+include: "rules/prepare_sequences_N450.smk"
+include: "rules/construct_phylogeny.smk"
+include: "rules/annotate_phylogeny.smk"
+include: "rules/export.smk"
+
+# Include custom rules defined in the config.
+if "custom_rules" in config:
+    for rule_file in config["custom_rules"]:
+
+        include: rule_file
+
+rule clean:
+    """Removing directories: {params}"""
+    params:
+        "results ",
+        "auspice"
+    shell:
+        "rm -rfv {params}"
diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json
new file mode 100644
index 0000000..d0e92c6
--- /dev/null
+++ b/nextclade/defaults/auspice_config.json
@@ -0,0 +1,55 @@
+{
+  "title": "Real-time tracking of measles virus evolution",
+  "maintainers": [
+    {"name": "Kim Andrews", "url": "https://bedford.io/team/kim-andrews/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "build_url": "https://github.com/nextstrain/measles",
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "genotype_ncbi",
+      "title": "Genotype (NCBI)",
+      "type": "categorical"
+    },
+    {
+      "key": "is_reference",
+      "title": "WHO reference",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true
+  },
+  "filters": [
+    "country",
+    "region",
+    "author"
+  ],
+  "metadata_columns": [
+    "author"
+  ]
+}
diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv
new file mode 100644
index 0000000..548f45a
--- /dev/null
+++ b/nextclade/defaults/colors.tsv
@@ -0,0 +1,31 @@
+region	Asia	#447CCD
+region	Oceania	#5EA9A1
+region	Africa	#8ABB6A
+region	Europe	#BEBB48
+region	South America	#E29E39
+region	North America	#E2562B
+
+genotype_ncbi	A	#5E1D9D
+genotype_ncbi	B1	#4B26B1
+genotype_ncbi	B2	#4138C3
+genotype_ncbi	B3	#3F4FCC
+genotype_ncbi	C1	#4065CF
+genotype_ncbi	C2	#447ACD
+genotype_ncbi	D1	#4A8BC3
+genotype_ncbi	D2	#529AB6
+genotype_ncbi	D3	#5BA6A6
+genotype_ncbi	D4	#66AE95
+genotype_ncbi	D5	#73B583
+genotype_ncbi	D6	#81B973
+genotype_ncbi	D7	#91BC64
+genotype_ncbi	D8	#A1BE58
+genotype_ncbi	D9	#B1BD4E
+genotype_ncbi	D10	#C0BA47
+genotype_ncbi	D11	#CEB541
+genotype_ncbi	E	#DAAD3D
+genotype_ncbi	F	#E19F3A
+genotype_ncbi	G1	#E68E36
+genotype_ncbi	G2	#E67832
+genotype_ncbi	G3	#E35F2D
+genotype_ncbi	H1	#DF4328
+genotype_ncbi	H2	#DB2823
diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml
new file mode 100644
index 0000000..aa785ee
--- /dev/null
+++ b/nextclade/defaults/config.yaml
@@ -0,0 +1,29 @@
+strain_id_field: "accession"
+files:
+    exclude: "defaults/dropped_strains.txt"
+    include_genome: "defaults/include_strains_genome.txt"
+    include_N450: "defaults/include_strains_N450.txt"
+    reference: "defaults/measles_reference.gb"
+    reference_N450: "defaults/measles_reference_N450.gb"
+    reference_N450_fasta: "defaults/measles_reference_N450.fasta"
+    colors: "defaults/colors.tsv"
+    auspice_config: "defaults/auspice_config.json"
+    auspice_config_N450: "defaults/auspice_config_N450.json"
+filter:
+    group_by: "country year"
+    sequences_per_group: 20
+    min_date: 1950
+    min_length: 5000
+filter_N450:
+    group_by: "country year"
+    subsample_max_sequences: 3000
+    min_date: 1950
+    min_length: 400
+refine:
+    coalescent: "opt"
+    date_inference: "marginal"
+    clock_filter_iqd: 4
+ancestral:
+    inference: "joint"
+export:
+    metadata_columns: "strain division location"
diff --git a/nextclade/defaults/dropped_strains.txt b/nextclade/defaults/dropped_strains.txt
new file mode 100644
index 0000000..7983808
--- /dev/null
+++ b/nextclade/defaults/dropped_strains.txt
@@ -0,0 +1,2 @@
+HM562901 # temara.MOR/24.03
+HM562900 # Mvs/Toulon.FRA/08.07
\ No newline at end of file
diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt
new file mode 100644
index 0000000..8e49e3d
--- /dev/null
+++ b/nextclade/defaults/include_strains.txt
@@ -0,0 +1,40 @@
+# Vaccine strain information from Parks et al. Comparison of predicted amino acid
+# sequences of measles virus strains in the Edmonston vaccine lineage
+# https://doi.org/10.1128/jvi.75.2.910-920.2001
+AF266288
+AF266287
+AF266290
+AF266289
+AF266291
+AF266286
+#
+# WHO genotype reference strains
+# Information from https://www.who.int/publications/i/item/WER8709
+AF045212
+AF045217
+AF079555
+AF171232
+AF243450
+AF280803
+AF481485
+AJ232203
+AY037020
+AY043459
+AY184217
+AY923185
+D01005
+GU440571
+L46750
+L46753
+L46758
+M89921
+U01974
+U01976
+U01977
+U01987
+U01994
+U01998
+U64582
+X84865
+X84872
+X84879
diff --git a/nextclade/defaults/measles_reference_N450.fasta b/nextclade/defaults/measles_reference_N450.fasta
new file mode 100644
index 0000000..eab1954
--- /dev/null
+++ b/nextclade/defaults/measles_reference_N450.fasta
@@ -0,0 +1,8 @@
+>lcl|NC_001498.1_cds_NP_056918.1_1 [gene=N] [locus_tag=MeVgp1] [db_xref=GeneID:1489804] [protein=nucleocapsid protein] [protein_id=NP_056918.1] [location=1233..1682] [gbkey=CDS]
+GTCAGTTCCACATTGGCATCCGAACTCGGTATCACTGCCGAGGATGCAAGGCTTGTTTCAGAGAT
+TGCAATGCATACTACTGAGGACAGGATCAGTAGAGCGGTCGGACCCAGACAAGCCCAAGTGTCATTTCTA
+CACGGTGATCAAAGTGAGAATGAGCTACCAGGATTGGGGGGCAAGGAAGATAGGAGGGTCAAACAGGGTC
+GGGGAGAAGCCAGGGAGAGCTACAGAGAAACCGGGTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCC
+TCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGGACAGTCGAAGG
+TCAGCTGACGCCCTGCTCAGGCTGCAAGCCATGGCAGGAATCTTGGAAGAACAAGGCTCAGACACGGACA
+CCCCTAGGGTATACAATGACAGAGATCTTCTAGAC
diff --git a/nextclade/defaults/measles_reference_N450.gb b/nextclade/defaults/measles_reference_N450.gb
new file mode 100644
index 0000000..277c96a
--- /dev/null
+++ b/nextclade/defaults/measles_reference_N450.gb
@@ -0,0 +1,69 @@
+LOCUS       NC_001498                450 bp    cRNA    linear   VRL 13-AUG-2018
+DEFINITION  Measles virus, complete genome.
+ACCESSION   NC_001498 REGION: 1233..1682
+VERSION     NC_001498.1
+DBLINK      Project: 15025
+            BioProject: PRJNA485481
+KEYWORDS    RefSeq.
+SOURCE      Measles morbillivirus
+  ORGANISM  Measles morbillivirus
+            Viruses; Riboviria; Orthornavirae; Negarnaviricota;
+            Haploviricotina; Monjiviricetes; Mononegavirales; Paramyxoviridae;
+            Orthoparamyxovirinae; Morbillivirus; Morbillivirus hominis.
+REFERENCE   1  (sites)
+  AUTHORS   Rima,B.K. and Duprex,W.P.
+  TITLE     The measles virus replication cycle
+  JOURNAL   Curr. Top. Microbiol. Immunol. 329, 77-102 (2009)
+   PUBMED   19198563
+REFERENCE   2
+  AUTHORS   Takeuchi,K., Miyajima,N., Kobune,F. and Tashiro,M.
+  TITLE     Comparative nucleotide sequence analyses of the entire genomes of
+            B95a cell-isolated and vero cell-isolated measles viruses from the
+            same patient
+  JOURNAL   Virus Genes 20 (3), 253-257 (2000)
+   PUBMED   10949953
+REFERENCE   3  (bases 1 to 450)
+  CONSRTM   NCBI Genome Project
+  TITLE     Direct Submission
+  JOURNAL   Submitted (01-AUG-2000) National Center for Biotechnology
+            Information, NIH, Bethesda, MD 20894, USA
+REFERENCE   4  (bases 1 to 450)
+  AUTHORS   Takeuchi,K., Tanabayashi,K. and Tashiro,M.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (10-JUL-1998) Kaoru Takeuchi, National Institute of
+            Infectious Diseases, Viral Disease and Vaccine Contorol; 4-7-1
+            Gakuen, Musashi-murayama, Tokyo 208-0011, Japan
+            (E-mail:ktake@nih.go.jp, Tel:81-42-561-0771(ex.530),
+            Fax:81-42-567-5631)
+COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
+            reference sequence was derived from AB016162.
+            Sequence updated (21-Jul-1998)
+            Sequence updated (11-Dec-1998).
+            COMPLETENESS: full length.
+FEATURES             Location/Qualifiers
+     source          1..450
+                     /organism="Measles morbillivirus"
+                     /mol_type="viral cRNA"
+                     /strain="Ichinose-B95a"
+                     /db_xref="taxon:11234"
+     CDS             <1..>450
+                     /gene="N"
+                     /codon_start=1
+                     /product="nucleocapsid protein"
+                     /protein_id="NP_056918.1"
+                     /db_xref="GeneID:1489804"
+                     /translation="VSSTLASELGITAEDAR
+                     LVSEIAMHTTEDRISRAVGPRQAQVSFLHGDQSENELPGLGGKEDRRVKQGRGEARES
+                     YRETGSSRASDARAAHPPTSMPLDIDTASESGQDPQDSRRSADALLRLQAMAGILEEQ
+                     GSDTDTPRVYNDRDLLD"
+ORIGIN      
+        1 gtcagttcca cattggcatc cgaactcggt atcactgccg aggatgcaag gcttgtttca
+       61 gagattgcaa tgcatactac tgaggacagg atcagtagag cggtcggacc cagacaagcc
+      121 caagtgtcat ttctacacgg tgatcaaagt gagaatgagc taccaggatt ggggggcaag
+      181 gaagatagga gggtcaaaca gggtcgggga gaagccaggg agagctacag agaaaccggg
+      241 tccagcagag caagtgatgc gagagctgcc catcctccaa ccagcatgcc cctagacatt
+      301 gacactgcat cggagtcagg ccaagatccg caggacagtc gaaggtcagc tgacgccctg
+      361 ctcaggctgc aagccatggc aggaatcttg gaagaacaag gctcagacac ggacacccct
+      421 agggtataca atgacagaga tcttctagac
+//
+
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
new file mode 100644
index 0000000..61e94b8
--- /dev/null
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -0,0 +1,41 @@
+"""
+This part of the workflow creates additonal annotations for the phylogenetic tree.
+
+See Augur's usage docs for these commands for more details.
+
+"""
+
+rule ancestral:
+    """Reconstructing ancestral sequences and mutations"""
+    input:
+        tree = "results/{gene}/tree.nwk",
+        alignment = "results/{gene}/aligned.fasta"
+    output:
+        node_data = "results/{gene}/nt_muts.json"
+    params:
+        inference = config["ancestral"]["inference"]
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}
+        """
+
+rule translate:
+    """Translating amino acid sequences"""
+    input:
+        tree = "results/{gene}/tree.nwk",
+        node_data = "results/{gene}/nt_muts.json",
+        reference = lambda wildcard: "defaults/measles_reference.gb" if wildcard.gene in ["genome"] else "defaults/measles_reference_{gene}.gb"
+    output:
+        node_data = "results/{gene}/aa_muts.json"
+    shell:
+        """
+        augur translate \
+            --tree {input.tree} \
+            --ancestral-sequences {input.node_data} \
+            --reference-sequence {input.reference} \
+            --output {output.node_data} \
+        """
diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk
new file mode 100644
index 0000000..e222800
--- /dev/null
+++ b/nextclade/rules/construct_phylogeny.smk
@@ -0,0 +1,56 @@
+"""
+This part of the workflow constructs the phylogenetic tree.
+
+See Augur's usage docs for these commands for more details.
+"""
+
+rule tree:
+    """Building tree"""
+    input:
+        alignment = "results/{gene}/aligned.fasta"
+    output:
+        tree = "results/{gene}/tree_raw.nwk"
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree}
+        """
+
+rule refine:
+    """
+    Refining tree
+      - estimate timetree
+      - use {params.coalescent} coalescent timescale
+      - estimate {params.date_inference} node dates
+      - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+    """
+    input:
+        tree = "results/{gene}/tree_raw.nwk",
+        alignment = "results/{gene}/aligned.fasta",
+        metadata = "data/metadata.tsv"
+    output:
+        tree = "results/{gene}/tree.nwk",
+        node_data = "results/{gene}/branch_lengths.json"
+    params:
+        coalescent = config["refine"]["coalescent"],
+        date_inference = config["refine"]["date_inference"],
+        clock_filter_iqd = config["refine"]["clock_filter_iqd"],
+        strain_id = config["strain_id_field"]
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --timetree \
+            --coalescent {params.coalescent} \
+            --date-confidence \
+            --date-inference {params.date_inference} \
+            --clock-filter-iqd {params.clock_filter_iqd} \
+            --stochastic-resolve
+        """
+        
\ No newline at end of file
diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk
new file mode 100644
index 0000000..96dfed5
--- /dev/null
+++ b/nextclade/rules/export.smk
@@ -0,0 +1,35 @@
+"""
+This part of the workflow collects the phylogenetic tree and annotations to
+export a Nextstrain dataset.
+
+See Augur's usage docs for these commands for more details.
+"""
+
+rule export:
+    """Exporting data files for for auspice"""
+    input:
+        tree = "results/{gene}/tree.nwk",
+        metadata = "data/metadata.tsv",
+        branch_lengths = "results/{gene}/branch_lengths.json",
+        nt_muts = "results/{gene}/nt_muts.json",
+        aa_muts = "results/{gene}/aa_muts.json",
+        colors = config["files"]["colors"],
+        auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json"
+    output:
+        auspice_json = "auspice/measles_{gene}.json"
+    params:
+        strain_id = config["strain_id_field"],
+        metadata_columns = config["export"]["metadata_columns"]
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \
+            --colors {input.colors} \
+            --metadata-columns {params.metadata_columns} \
+            --auspice-config {input.auspice_config} \
+            --include-root-sequence-inline \
+            --output {output.auspice_json}
+        """
diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk
new file mode 100644
index 0000000..508fcc7
--- /dev/null
+++ b/nextclade/rules/prepare_sequences.smk
@@ -0,0 +1,88 @@
+"""
+This part of the workflow prepares sequences for constructing the phylogenetic tree.
+
+See Augur's usage docs for these commands for more details.
+"""
+rule download:
+    """Downloading sequences and metadata from data.nextstrain.org"""
+    output:
+        sequences = "data/sequences.fasta.zst",
+        metadata = "data/metadata.tsv.zst"
+    params:
+        sequences_url = "https://data.nextstrain.org/files/workflows/measles/sequences.fasta.zst",
+        metadata_url = "https://data.nextstrain.org/files/workflows/measles/metadata.tsv.zst"
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+rule decompress:
+    """Decompressing sequences and metadata"""
+    input:
+        sequences = "data/sequences.fasta.zst",
+        metadata = "data/metadata.tsv.zst"
+    output:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv"
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+rule filter:
+    """
+    Filtering to
+      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
+      - from {params.min_date} onwards
+      - excluding strains in {input.exclude}
+      - minimum genome length of {params.min_length}
+    """
+    input:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv",
+        exclude = config["files"]["exclude"],
+        include = config["files"]["include_genome"]
+    output:
+        sequences = "results/genome/filtered.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        sequences_per_group = config["filter"]["sequences_per_group"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["min_length"],
+        strain_id = config["strain_id_field"]
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --exclude {input.exclude} \
+            --output {output.sequences} \
+            --group-by {params.group_by} \
+            --sequences-per-group {params.sequences_per_group} \
+            --min-date {params.min_date} \
+            --min-length {params.min_length} 
+        """
+
+rule align:
+    """
+    Aligning sequences to {input.reference}
+      - filling gaps with N
+    """
+    input:
+        sequences = "results/genome/filtered.fasta",
+        reference = config["files"]["reference"]
+    output:
+        alignment = "results/genome/aligned.fasta"
+    shell:
+        """
+        augur align \
+            --sequences {input.sequences} \
+            --reference-sequence {input.reference} \
+            --output {output.alignment} \
+            --fill-gaps \
+            --remove-reference
+        """
+        
\ No newline at end of file

From 449aec00309d72020956292560315434b810f661 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Sat, 27 Apr 2024 15:53:34 -0700
Subject: [PATCH 2/8] Prepare sequences for Nextclade Dataset tree

* Add rule to align sequences to N450 reference using Nextclade,
following https://github.com/nextstrain/measles/pull/20/commits/edbefd52b3222ba92cd7fb6b9bbf6f9d3b72a26a
* Filter to produce a sample set appropriate for a Nextclade Dataset tree
---
 nextclade/Snakefile                    |  1 -
 nextclade/defaults/config.yaml         | 17 +++----
 nextclade/defaults/dropped_strains.txt | 31 ++++++++++++-
 nextclade/defaults/include_strains.txt | 13 ++++++
 nextclade/rules/prepare_sequences.smk  | 64 ++++++++++++--------------
 5 files changed, 79 insertions(+), 47 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index b4521ff..1bcbf15 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -7,7 +7,6 @@ rule all:
         auspice_json = expand("auspice/measles_{gene}.json", gene=genes)
 
 include: "rules/prepare_sequences.smk"
-include: "rules/prepare_sequences_N450.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
 include: "rules/export.smk"
diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml
index aa785ee..bb67af5 100644
--- a/nextclade/defaults/config.yaml
+++ b/nextclade/defaults/config.yaml
@@ -1,22 +1,17 @@
 strain_id_field: "accession"
 files:
     exclude: "defaults/dropped_strains.txt"
-    include_genome: "defaults/include_strains_genome.txt"
-    include_N450: "defaults/include_strains_N450.txt"
-    reference: "defaults/measles_reference.gb"
+    include: "defaults/include_strains.txt"
     reference_N450: "defaults/measles_reference_N450.gb"
     reference_N450_fasta: "defaults/measles_reference_N450.fasta"
     colors: "defaults/colors.tsv"
     auspice_config: "defaults/auspice_config.json"
-    auspice_config_N450: "defaults/auspice_config_N450.json"
+align_and_extract_N450:
+    min_length: 400
+    min_seed_cover: 0.01
 filter:
-    group_by: "country year"
-    sequences_per_group: 20
-    min_date: 1950
-    min_length: 5000
-filter_N450:
-    group_by: "country year"
-    subsample_max_sequences: 3000
+    group_by: "region genotype_ncbi year"
+    subsample_max_sequences: 500
     min_date: 1950
     min_length: 400
 refine:
diff --git a/nextclade/defaults/dropped_strains.txt b/nextclade/defaults/dropped_strains.txt
index 7983808..e48b51d 100644
--- a/nextclade/defaults/dropped_strains.txt
+++ b/nextclade/defaults/dropped_strains.txt
@@ -1,2 +1,31 @@
 HM562901 # temara.MOR/24.03
-HM562900 # Mvs/Toulon.FRA/08.07
\ No newline at end of file
+HM562900 # Mvs/Toulon.FRA/08.07
+#
+# Incorrect genotypes reported on NCBI:
+KY941950
+KX610825
+KX603680
+KX455930
+KX420624
+KX024471
+KP856709
+KP734120
+KP734117
+KP056769
+KM017095
+KJ556875
+KJ556859
+KC139079
+JX556685
+JX556684
+JN005809
+GU937234
+AF410987
+#
+# Genotype assignment is ambiguous based on tree structure:
+KY678430
+AB453044
+#
+# Clock rate outliers:
+AB573812
+HM562905
diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt
index 8e49e3d..57c14f0 100644
--- a/nextclade/defaults/include_strains.txt
+++ b/nextclade/defaults/include_strains.txt
@@ -38,3 +38,16 @@ U64582
 X84865
 X84872
 X84879
+#
+# Rare genotypes
+# Including these to boost representation of these genotypes in the nextclade tree
+AF410989  #Rare genotype: E
+MG912591  #Rare genotype: G2
+AY037009  #Rare genotype: G2
+AY037043  #Rare genotype: H2
+AY037026  #Rare genotype: H2
+AY037028  #Rare genotype: D2
+FJ668380  #Rare genotype: D10
+MN017369  #Rare genotype: D11
+KC968467  #Rare genotype: D11
+KC968354  #Rare genotype: D11
diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk
index 508fcc7..428a0de 100644
--- a/nextclade/rules/prepare_sequences.smk
+++ b/nextclade/rules/prepare_sequences.smk
@@ -31,24 +31,39 @@ rule decompress:
         zstd -d -c {input.metadata} > {output.metadata}
         """
 
-rule filter:
-    """
-    Filtering to
-      - {params.sequences_per_group} sequence(s) per {params.group_by!s}
-      - from {params.min_date} onwards
-      - excluding strains in {input.exclude}
-      - minimum genome length of {params.min_length}
-    """
+rule align_and_extract_N450:
     input:
         sequences = "data/sequences.fasta",
+        reference = config["files"]["reference_N450_fasta"]
+    output:
+        sequences = "results/sequences_N450.fasta"
+    params:
+        min_seed_cover = config['align_and_extract_N450']['min_seed_cover'],
+        min_length = config['align_and_extract_N450']['min_length']
+    threads: workflow.cores
+    shell:
+        """
+        nextclade3 run \
+           --jobs {threads} \
+           --input-ref {input.reference} \
+           --output-fasta {output.sequences} \
+           --min-seed-cover {params.min_seed_cover} \
+           --min-length {params.min_length} \
+           --silent \
+           {input.sequences}
+        """
+
+rule filter:
+    input:
+        sequences = "results/sequences_N450.fasta",
         metadata = "data/metadata.tsv",
         exclude = config["files"]["exclude"],
-        include = config["files"]["include_genome"]
+        include = config["files"]["include"]
     output:
-        sequences = "results/genome/filtered.fasta"
+        sequences = "results/aligned.fasta"
     params:
         group_by = config["filter"]["group_by"],
-        sequences_per_group = config["filter"]["sequences_per_group"],
+        subsample_max_sequences = config["filter"]["subsample_max_sequences"],
         min_date = config["filter"]["min_date"],
         min_length = config["filter"]["min_length"],
         strain_id = config["strain_id_field"]
@@ -59,30 +74,11 @@ rule filter:
             --metadata {input.metadata} \
             --metadata-id-columns {params.strain_id} \
             --exclude {input.exclude} \
+            --include {input.include} \
             --output {output.sequences} \
             --group-by {params.group_by} \
-            --sequences-per-group {params.sequences_per_group} \
+            --subsample-max-sequences {params.subsample_max_sequences} \
             --min-date {params.min_date} \
-            --min-length {params.min_length} 
-        """
-
-rule align:
-    """
-    Aligning sequences to {input.reference}
-      - filling gaps with N
-    """
-    input:
-        sequences = "results/genome/filtered.fasta",
-        reference = config["files"]["reference"]
-    output:
-        alignment = "results/genome/aligned.fasta"
-    shell:
-        """
-        augur align \
-            --sequences {input.sequences} \
-            --reference-sequence {input.reference} \
-            --output {output.alignment} \
-            --fill-gaps \
-            --remove-reference
+            --min-length {params.min_length} \
+            --query='genotype_ncbi!=""'
         """
-        
\ No newline at end of file

From 5e76b3102735aa2fc1857aa16623dbc94c5acb50 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Sat, 27 Apr 2024 15:58:23 -0700
Subject: [PATCH 3/8] Construct phylogeny for Nextclade Dataset tree

Remove construct_phylogeny.smk wildcards that were needed for the Phylogenetic workflow but are not needed for the Nextclade Dataset tree workflow.
---
 nextclade/rules/construct_phylogeny.smk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk
index e222800..5a34b9e 100644
--- a/nextclade/rules/construct_phylogeny.smk
+++ b/nextclade/rules/construct_phylogeny.smk
@@ -7,9 +7,9 @@ See Augur's usage docs for these commands for more details.
 rule tree:
     """Building tree"""
     input:
-        alignment = "results/{gene}/aligned.fasta"
+        alignment = "results/aligned.fasta"
     output:
-        tree = "results/{gene}/tree_raw.nwk"
+        tree = "results/tree_raw.nwk"
     shell:
         """
         augur tree \
@@ -26,12 +26,12 @@ rule refine:
       - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
     """
     input:
-        tree = "results/{gene}/tree_raw.nwk",
-        alignment = "results/{gene}/aligned.fasta",
+        tree = "results/tree_raw.nwk",
+        alignment = "results/aligned.fasta",
         metadata = "data/metadata.tsv"
     output:
-        tree = "results/{gene}/tree.nwk",
-        node_data = "results/{gene}/branch_lengths.json"
+        tree = "results/tree.nwk",
+        node_data = "results/branch_lengths.json"
     params:
         coalescent = config["refine"]["coalescent"],
         date_inference = config["refine"]["date_inference"],

From df0ba96f91581b1fb2629fc476a0ef3922f1f0dd Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Sat, 27 Apr 2024 16:23:58 -0700
Subject: [PATCH 4/8] Annotate phylogeny for Nextclade Dataset tree

* Use `augur ancestral` option `--root-sequence` to add mutations from the reference sequence to the root of the tree, which is required for Nextclade Dataset trees, as described here: https://github.com/nextstrain/nextclade_data/blob/577696e4ee0f0dff925d3911d27351a7395fab3d/docs/dataset-creation-guide.md
* Use `augur clades` to assign genotypes to nodes
* Remove annotate_phylogeny.smk wildcards that were needed for the Phylogenetic workflow but are not needed for the Nextclade Dataset tree workflow.
---
 nextclade/defaults/clades.tsv          | 64 ++++++++++++++++++++++++++
 nextclade/defaults/config.yaml         |  1 +
 nextclade/rules/annotate_phylogeny.smk | 37 +++++++++++----
 3 files changed, 93 insertions(+), 9 deletions(-)
 create mode 100644 nextclade/defaults/clades.tsv

diff --git a/nextclade/defaults/clades.tsv b/nextclade/defaults/clades.tsv
new file mode 100644
index 0000000..2c4261b
--- /dev/null
+++ b/nextclade/defaults/clades.tsv
@@ -0,0 +1,64 @@
+clade	gene	site	alt
+A	nuc	89	A
+A	nuc	126	A
+A	nuc	422	T
+A	nuc	439	A
+B1	nuc	36	G
+B1	nuc	79	G
+B1	nuc	111	T
+B1	nuc	261	A
+B1	nuc	279	G
+B2	nuc	323	T
+B3	nuc	351	C
+C1	nuc	75	C
+C1	nuc	246	T
+C1	nuc	317	T
+C1	nuc	354	A
+C2	nuc	81	A
+C2	nuc	118	T
+D1	nuc	23	G
+D1	nuc	45	C
+D1	nuc	330	T
+D10	nuc	123	G
+D10	nuc	218	T
+D10	nuc	323	G
+D10	nuc	444	A
+D11	nuc	54	A
+D11	nuc	214	T
+D2	nuc	207	G
+D2	nuc	255	C
+D2	nuc	15	A
+D2	nuc	250	A
+D3	nuc	21	C
+D3	nuc	186	C
+D3	nuc	275	C
+D3	nuc	287	T
+D4	nuc	133	A
+D4	nuc	367	T
+D4	nuc	416	T
+D5	nuc	48	G
+D5	nuc	105	C
+D6	nuc	316	A
+D7	nuc	339	C
+D7	nuc	343	C
+D8	nuc	226	A
+D8	nuc	251	T
+D9	nuc	216	A
+E	nuc	224	T
+E	nuc	329	A
+E	nuc	341	A
+F	nuc	222	A
+F	nuc	297	T
+F	nuc	332	T
+G1	nuc	27	T
+G1	nuc	177	G
+G1	nuc	419	T
+G2	nuc	242	A
+G2	nuc	271	G
+G3	nuc	3	A
+H1	nuc	126	C
+H1	nuc	224	A
+H2	nuc	84	A
+H2	nuc	171	A
+H2	nuc	245	A
+H2	nuc	387	T
diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml
index bb67af5..41895a9 100644
--- a/nextclade/defaults/config.yaml
+++ b/nextclade/defaults/config.yaml
@@ -4,6 +4,7 @@ files:
     include: "defaults/include_strains.txt"
     reference_N450: "defaults/measles_reference_N450.gb"
     reference_N450_fasta: "defaults/measles_reference_N450.fasta"
+    clades: "defaults/clades.tsv"
     colors: "defaults/colors.tsv"
     auspice_config: "defaults/auspice_config.json"
 align_and_extract_N450:
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
index 61e94b8..6ca5ec3 100644
--- a/nextclade/rules/annotate_phylogeny.smk
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -8,29 +8,31 @@ See Augur's usage docs for these commands for more details.
 rule ancestral:
     """Reconstructing ancestral sequences and mutations"""
     input:
-        tree = "results/{gene}/tree.nwk",
-        alignment = "results/{gene}/aligned.fasta"
+        tree = "results/tree.nwk",
+        alignment = "results/aligned.fasta"
     output:
-        node_data = "results/{gene}/nt_muts.json"
+        node_data = "results/nt_muts.json"
     params:
-        inference = config["ancestral"]["inference"]
+        inference = config["ancestral"]["inference"],
+        reference_fasta = config["files"]["reference_N450_fasta"]
     shell:
         """
         augur ancestral \
             --tree {input.tree} \
             --alignment {input.alignment} \
             --output-node-data {output.node_data} \
-            --inference {params.inference}
+            --inference {params.inference}  \
+            --root-sequence {params.reference_fasta}
         """
 
 rule translate:
     """Translating amino acid sequences"""
     input:
-        tree = "results/{gene}/tree.nwk",
-        node_data = "results/{gene}/nt_muts.json",
-        reference = lambda wildcard: "defaults/measles_reference.gb" if wildcard.gene in ["genome"] else "defaults/measles_reference_{gene}.gb"
+        tree = "results/tree.nwk",
+        node_data = "results/nt_muts.json",
+        reference = config["files"]["reference_N450"]
     output:
-        node_data = "results/{gene}/aa_muts.json"
+        node_data = "results/aa_muts.json"
     shell:
         """
         augur translate \
@@ -39,3 +41,20 @@ rule translate:
             --reference-sequence {input.reference} \
             --output {output.node_data} \
         """
+
+rule clades:
+    input:
+        tree = "results/tree.nwk",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        clade_defs = config["files"]["clades"]
+    output:
+        clades = "results/clades.json"
+    shell:
+        """
+        augur clades \
+            --tree {input.tree} \
+            --mutations {input.nt_muts} {input.aa_muts} \
+            --clades {input.clade_defs} \
+            --output {output.clades}
+        """

From df7c7f044378b96e928da981754a7072c37ecd8b Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Sat, 27 Apr 2024 16:33:33 -0700
Subject: [PATCH 5/8] Export auspice json for Nextclade Dataset tree

* Include `augur clades` genotype assignments in exported tree and in coloring options
* Remove export.smk wildcards that were needed for the Phylogenetic workflow but are not needed for the Nextclade Dataset tree workflow.
---
 nextclade/Snakefile                    |  4 +---
 nextclade/defaults/auspice_config.json |  5 +++++
 nextclade/defaults/colors.tsv          | 30 +++++++++++++++++++++++++-
 nextclade/rules/export.smk             | 17 ++++++++-------
 phylogenetic/defaults/colors.tsv       |  4 +++-
 5 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 1bcbf15..dd876ff 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -1,10 +1,8 @@
-genes = ['N450', 'genome']
-
 configfile: "defaults/config.yaml" 
 
 rule all:
     input:
-        auspice_json = expand("auspice/measles_{gene}.json", gene=genes)
+        auspice_json = "auspice/measles.json"
 
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json
index d0e92c6..aa89017 100644
--- a/nextclade/defaults/auspice_config.json
+++ b/nextclade/defaults/auspice_config.json
@@ -35,6 +35,11 @@
       "key": "is_reference",
       "title": "WHO reference",
       "type": "categorical"
+    }, 
+    { 
+      "key": "clade_membership", 
+      "title": "Genotype (augur clades)", 
+      "type": "categorical" 
     }
   ],
   "geo_resolutions": [
diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv
index 548f45a..e9fa118 100644
--- a/nextclade/defaults/colors.tsv
+++ b/nextclade/defaults/colors.tsv
@@ -1,10 +1,12 @@
+# Regions: These fields are identical to those in phylogenetic/defaults/colors.tsv. Any changes to one should also be made to the other.
 region	Asia	#447CCD
 region	Oceania	#5EA9A1
 region	Africa	#8ABB6A
 region	Europe	#BEBB48
 region	South America	#E29E39
 region	North America	#E2562B
-
+#
+# MeV Genotypes reported in NCBI GenBank metadata: These fields are identical to those in phylogenetic/defaults/colors.tsv. Any changes to one should also be made to the other.
 genotype_ncbi	A	#5E1D9D
 genotype_ncbi	B1	#4B26B1
 genotype_ncbi	B2	#4138C3
@@ -29,3 +31,29 @@ genotype_ncbi	G2	#E67832
 genotype_ncbi	G3	#E35F2D
 genotype_ncbi	H1	#DF4328
 genotype_ncbi	H2	#DB2823
+#
+# MeV Genotypes assigned by augur clades
+clade_membership	A	#5E1D9D
+clade_membership	B1	#4B26B1
+clade_membership	B2	#4138C3
+clade_membership	B3	#3F4FCC
+clade_membership	C1	#4065CF
+clade_membership	C2	#447ACD
+clade_membership	D1	#4A8BC3
+clade_membership	D2	#529AB6
+clade_membership	D3	#5BA6A6
+clade_membership	D4	#66AE95
+clade_membership	D5	#73B583
+clade_membership	D6	#81B973
+clade_membership	D7	#91BC64
+clade_membership	D8	#A1BE58
+clade_membership	D9	#B1BD4E
+clade_membership	D10	#C0BA47
+clade_membership	D11	#CEB541
+clade_membership	E	#DAAD3D
+clade_membership	F	#E19F3A
+clade_membership	G1	#E68E36
+clade_membership	G2	#E67832
+clade_membership	G3	#E35F2D
+clade_membership	H1	#DF4328
+clade_membership	H2	#DB2823
diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk
index 96dfed5..45ac446 100644
--- a/nextclade/rules/export.smk
+++ b/nextclade/rules/export.smk
@@ -6,17 +6,18 @@ See Augur's usage docs for these commands for more details.
 """
 
 rule export:
-    """Exporting data files for for auspice"""
+    """Exporting data files for auspice"""
     input:
-        tree = "results/{gene}/tree.nwk",
+        tree = "results/tree.nwk",
         metadata = "data/metadata.tsv",
-        branch_lengths = "results/{gene}/branch_lengths.json",
-        nt_muts = "results/{gene}/nt_muts.json",
-        aa_muts = "results/{gene}/aa_muts.json",
+        branch_lengths = "results/branch_lengths.json",
+        clades = "results/clades.json",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
         colors = config["files"]["colors"],
-        auspice_config = lambda wildcard: "defaults/auspice_config.json" if wildcard.gene in ["genome"] else "defaults/auspice_config_N450.json"
+        auspice_config = config["files"]["auspice_config"]
     output:
-        auspice_json = "auspice/measles_{gene}.json"
+        auspice_json = "auspice/measles.json"
     params:
         strain_id = config["strain_id_field"],
         metadata_columns = config["export"]["metadata_columns"]
@@ -26,7 +27,7 @@ rule export:
             --tree {input.tree} \
             --metadata {input.metadata} \
             --metadata-id-columns {params.strain_id} \
-            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} \
+            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.clades} \
             --colors {input.colors} \
             --metadata-columns {params.metadata_columns} \
             --auspice-config {input.auspice_config} \
diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv
index 548f45a..8036510 100644
--- a/phylogenetic/defaults/colors.tsv
+++ b/phylogenetic/defaults/colors.tsv
@@ -1,10 +1,12 @@
+# Regions: These fields are identical to those in nextclade/defaults/colors.tsv. Any changes to one should also be made to the other.
 region	Asia	#447CCD
 region	Oceania	#5EA9A1
 region	Africa	#8ABB6A
 region	Europe	#BEBB48
 region	South America	#E29E39
 region	North America	#E2562B
-
+#
+# MeV Genotypes reported in NCBI GenBank metadata: These fields are identical to those in nextclade/defaults/colors.tsv. Any changes to one should also be made to the other.
 genotype_ncbi	A	#5E1D9D
 genotype_ncbi	B1	#4B26B1
 genotype_ncbi	B2	#4138C3

From ba4c9be8b62b6be9dff60fba3226d827aa1a54e4 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Tue, 30 Apr 2024 13:58:42 -0700
Subject: [PATCH 6/8] Add README.md

---
 nextclade/README.md | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 nextclade/README.md

diff --git a/nextclade/README.md b/nextclade/README.md
new file mode 100644
index 0000000..79bfa4f
--- /dev/null
+++ b/nextclade/README.md
@@ -0,0 +1,36 @@
+
+# Measles Nextclade Dataset Tree
+
+This workflow creates a phylogenetic tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on [criteria outlined by the WHO](https://www.who.int/publications/i/item/WER8709). 
+
+The WHO has defined 24 measles genotypes based on N gene and H gene sequences from 28 reference strains. For new measles samples, genotypes can be assigned based on genetic similarity to the reference strains at the "N450" region (a 450 bp region of the N gene). 
+
+The tree created here includes N450 sequences for the 28 reference strains, along with other representative strains for each genotype. 
+
+The workflow includes the following steps:
+* Build a tree using samples from the `ingest` output, with the following sampling criteria:
+	* Exclude samples for which a genotype is NOT present on NCBI (indicated in the metadata column "genotype_ncbi")
+	* Force-include the following samples:
+		* WHO genotype reference strains
+		* Vaccine strains
+		* All available samples for genotypes that are poorly represented on NCBI (i.e., genotypes that have fewer than 10 samples on NCBI)
+	* Subsampling criteria:
+	  * group_by: "region genotype_ncbi year"
+      * subsample_max_sequences: 500
+	  * min_date: 1950
+      * min_length: 400
+* Assign genotypes to each sample and internal nodes of the tree with `augur clades`, using clade-defining mutations in `defaults/clades.tsv`
+* Provide the following coloring options on the tree:
+	* WHO reference strains ("True" or "False")
+	* Genotype assignment from `augur clades`
+	* Genotype assignment reported on NCBI
+
+## How to create a new tree:
+* Run the workflow: `nextstrain build .`
+* Inspect the output tree by comparing genotype assignments from the following sources:
+	* WHO reference strains
+	* `augur clades` output
+	* NCBI Datasets output
+* If unwanted samples are present in the tree, add them to `defaults/dropped_strains.tsv` and re-run the workflow
+* If any changes are needed to the clade-defining mutations, add changes to `defaults/clades.tsv` and re-run the workflow
+* Repeat as needed

From 07bf7378455a706ccd34b2055988c8e1730df50f Mon Sep 17 00:00:00 2001
From: Trevor Bedford <trevor@bedford.io>
Date: Wed, 1 May 2024 16:30:04 -0700
Subject: [PATCH 7/8] Update Auspice config for Nextclade tree

This makes a few various updates to the Auspice config for the Nextclade tree including:
1. Making "clade_membership" default coloring. For this tree, we really care about MeV Genotype more than other colorings.
2. Call this "MeV Genotype (Nextstrain)". I think important to specify MeV Genotype to distinguish from the more common SNP-level genotype.
3. Clarify with "MeV Genotype (GenBank metadata)". The previous "(NCBI)" seemed like this could be a MeV Genotype call from an NCBI tool.
4. Order as Region then Country to follow ncov
5. Include clade_membership as filter option
---
 nextclade/defaults/auspice_config.json | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json
index aa89017..07109d8 100644
--- a/nextclade/defaults/auspice_config.json
+++ b/nextclade/defaults/auspice_config.json
@@ -17,8 +17,8 @@
       "type": "continuous"
     },
     {
-      "key": "country",
-      "title": "Country",
+      "key": "clade_membership",
+      "title": "MeV Genotype (Nextstrain)",
       "type": "categorical"
     },
     {
@@ -26,20 +26,20 @@
       "title": "Region",
       "type": "categorical"
     },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
     {
       "key": "genotype_ncbi",
-      "title": "Genotype (NCBI)",
+      "title": "MeV Genotype (GenBank metadata)",
       "type": "categorical"
     },
     {
       "key": "is_reference",
-      "title": "WHO reference",
+      "title": "WHO Reference",
       "type": "categorical"
-    }, 
-    { 
-      "key": "clade_membership", 
-      "title": "Genotype (augur clades)", 
-      "type": "categorical" 
     }
   ],
   "geo_resolutions": [
@@ -47,11 +47,13 @@
     "region"
   ],
   "display_defaults": {
-    "map_triplicate": true
+    "map_triplicate": true,
+    "color_by": "clade_membership"
   },
   "filters": [
-    "country",
+    "clade_membership",
     "region",
+    "country",
     "author"
   ],
   "metadata_columns": [

From 1fd8aec29e1cf704656018aefcc09676d4677af1 Mon Sep 17 00:00:00 2001
From: Kim Andrews <17375001+kimandrews@users.noreply.github.com>
Date: Thu, 9 May 2024 13:22:40 -0700
Subject: [PATCH 8/8] Update Changelog

---
 CHANGES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGES.md b/CHANGES.md
index 7c342b3..fbe1472 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,4 +1,5 @@
 # CHANGELOG
+* 9 May 2024: Create a N450 tree that can be used as part of a Nextclade dataset to assign genotypes to measles samples based on criteria outlined by the WHO [PR #28](https://github.com/nextstrain/measles/pull/28)
 * 25 April 2024: Add specific sequences and metadata to the measles trees, including WHO reference sequences, vaccine strains, and genotypes reported on NCBI [PR #26](https://github.com/nextstrain/measles/pull/26)
 * 10 April 2024: Add a single GH Action workflow to automate the ingest and phylogenetic workflows [PR #22](https://github.com/nextstrain/measles/pull/22)
 * 2 April 2024: Add nextstrain-automation build-configs for deploying the final Auspice dataset of the phylogenetic workflow [PR #21](https://github.com/nextstrain/measles/pull/21)