Initial separation of washington state specific configs and rules

This separates the Washington-specific rules and configs from the new global workflow. Subsequent commits will tune global rules and configs, and rooting. The Washington-specific workflow can run via: nextstrain build phylogenetic --configfile build-configs/washington-state/config.yaml
nextstrain · Oct 17, 2024 · 39d468d · 39d468d
1 parent 0e0714c
commit 39d468d
Show file tree

Hide file tree

Showing 6 changed files with 112 additions and 66 deletions.
diff --git a/phylogenetic/build-configs/washington-state/config.yaml b/phylogenetic/build-configs/washington-state/config.yaml
@@ -1,11 +1,37 @@
 # This configuration file contains the custom configurations parameters
-# for the CI workflow to run with the example data.
+# for the Washington State phylogenetic build with custom rules and metadata
 
-# Pull in metadata and sequences from the example_data directory
-input_metadata: "example_data/metadata.tsv"
-input_sequences: "example_data/sequences.fasta"
+# Use 'NY99' as the reference since it should be basal to the USA sequences
+reference: "defaults/reference.gb"
+# Use 'IS88' as the root strain on the phylogenetic tree to place samples within the global context
+root: "AF481864"
+
+# Pull in metadata and sequences from the ingest directory after it has been annotated with washington-state specific metadata
+input_metadata: "../ingest/results/metadata.tsv"
+input_sequences: "../ingest/results/sequences.fasta"
+
+# This command excludes all strains by default and then forces the inclusion of
+# the strains selected by the subsampling logic defined above.
+subsampling:
+  state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
+  neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
+  region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
+  country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
+  force_include: --exclude-all --include ../nextclade/defaults/include.txt
+
+traits:
+  metadata_columns: [
+    'country',
+    'division',
+    'location',
+    'clade_membership',
+    'host'
+  ]
+
+export:
+  auspice_config: "defaults/auspice_config.json"
 
 ## Custom rules to run as part of the CI automated workflow
 ## The paths should be relative to the phylogenetic directory.
-#custom_rules:
-#  - build-configs/ci/copy_example_data.smk
+custom_rules:
+  - build-configs/washington-state/washington-state-rules.smk
diff --git a/phylogenetic/build-configs/washington-state/copy_example_data.smk b/phylogenetic/build-configs/washington-state/copy_example_data.smk
diff --git a/phylogenetic/build-configs/washington-state/washington-state-rules.smk b/phylogenetic/build-configs/washington-state/washington-state-rules.smk
@@ -0,0 +1,77 @@
+"""
+These are Washington-specific rules for the phylogenetic workflow.
+"""
+
+rule create_lat_longs:
+    """
+    This rule creates an averaged lat_longs.tsv file from the metadata_filtered.tsv file, but this requires a USA state annotation. This rule fails on global datasets.
+    """
+    input:
+        metadata = "results/metadata_filtered.tsv"
+    output:
+        lat_longs = "results/lat_longs.tsv"
+    log:
+        "logs/lat_longs.txt",
+    benchmark:
+        "benchmarks/lat_longs.txt"
+    shell:
+        """
+        python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log}
+        """
+
+
+rule create_colors:
+    input:
+        metadata = "results/metadata_filtered.tsv"
+    output:
+        colors = "results/colors.tsv"
+    log:
+            "logs/colors.txt",
+    benchmark:
+            "benchmarks/colors.txt"
+    shell:
+        """
+        python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log}
+        """
+
+
+rule export_washington_build:
+    """
+    This part of the workflow collects the phylogenetic tree and annotations to
+    export a Nextstrain dataset.
+    This includes incorporating the lat_longs.tsv annotation.
+    """
+    input:
+        tree = "results/tree.nwk",
+        metadata = "results/metadata_filtered.tsv",
+        branch_lengths = "results/branch_lengths.json",
+        traits = "results/traits.json",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        colors = "results/colors.tsv",
+        description = config["export"]["description"],
+        lat_longs = "results/lat_longs.tsv",
+        auspice_config = config["export"]["auspice_config"],
+    output:
+        auspice = "auspice/WNV_genome.json"
+    log:
+        "logs/export.txt",
+    benchmark:
+        "benchmarks/export.txt"
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --metadata-id-columns "accession" \
+            --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
+            --colors {input.colors} \
+            --lat-longs {input.lat_longs} \
+            --description {input.description} \
+            --auspice-config {input.auspice_config} \
+            --output {output.auspice} 2>&1 | tee {log}
+        """
+
+# Both export_washington_build and the default export rule write
+# auspice/WNV_genome.json; this ruleorder makes Snakemake prefer the Washington rule.
+ruleorder: export_washington_build > export
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
@@ -66,20 +66,12 @@ input_sequences: "data/sequences.fasta"
 # This command excludes all strains by default and then forces the inclusion of
 # the strains selected by the subsampling logic defined above.
 subsampling:
-  state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
-  neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
-  region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
-  country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
-  force_include: --exclude-all --include ../nextclade/defaults/include.txt
-   #global: --query "country != 'USA'" --group-by country year --subsample-max-sequences 200
+  region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000
 
 traits:
   metadata_columns: [
-    'country',
-    'division',
-    'location',
-    'clade_membership',
-    'host'
+    'region',
+    'country'
   ]
 
 export:

diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
@@ -34,9 +34,7 @@ rule export:
         traits = "results/traits.json",
         nt_muts = "results/nt_muts.json",
         aa_muts = "results/aa_muts.json",
-        colors = "results/colors.tsv",
         description = config["export"]["description"],
-        lat_longs = "results/lat_longs.tsv",
         auspice_config = config["export"]["auspice_config"],
     output:
         auspice = "auspice/WNV_genome.json"
@@ -51,8 +49,6 @@ rule export:
             --metadata {input.metadata} \
             --metadata-id-columns "accession" \
             --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
-            --colors {input.colors} \
-            --lat-longs {input.lat_longs} \
             --description {input.description} \
             --auspice-config {input.auspice_config} \
             --output {output.auspice} 2>&1 | tee {log}

diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
@@ -51,34 +51,6 @@ rule decompress:
         """
 
 
-rule create_colors:
-    input:
-        metadata = "results/metadata_filtered.tsv"
-    output:
-        colors = "results/colors.tsv"
-    log:
-            "logs/colors.txt",
-    benchmark:
-            "benchmarks/colors.txt"
-    shell:
-        """
-        python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log}
-        """
-
-rule create_lat_longs:
-    input:
-        metadata = "results/metadata_filtered.tsv"
-    output:
-        lat_longs = "results/lat_longs.tsv"
-    log:
-        "logs/lat_longs.txt",
-    benchmark:
-        "benchmarks/lat_longs.txt"
-    shell:
-        """
-        python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log}
-        """
-
 rule align:
     input:
         sequences = "results/sequences_filtered.fasta",