Skip to content

Commit

Permalink
Initial separation of washington state specific configs and rules
Browse files Browse the repository at this point in the history
This separates the Washington-specific rules and configs from the new global workflow.
Subsequent commits will tune global rules and configs, and rooting.

The Washington-specific workflow can be run via:

nextstrain build phylogenetic --configfile build-configs/washington-state/config.yaml
  • Loading branch information
j23414 committed Oct 17, 2024
1 parent 0e0714c commit 39d468d
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 66 deletions.
38 changes: 32 additions & 6 deletions phylogenetic/build-configs/washington-state/config.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,37 @@
# This configuration file contains the custom configurations parameters
# for the CI workflow to run with the example data.
# for the Washington State phylogenetic build with custom rules and metadata

# Pull in metadata and sequences from the example_data directory
input_metadata: "example_data/metadata.tsv"
input_sequences: "example_data/sequences.fasta"
# Use 'NY99' as the reference since it should be basal to the USA sequences
reference: "defaults/reference.gb"
# Use 'IS88' as the root strain on the phylogenetic tree to place samples within the global context
root: "AF481864"

# Pull in metadata and sequences from the ingest directory after they have been annotated with Washington State-specific metadata
input_metadata: "../ingest/results/metadata.tsv"
input_sequences: "../ingest/results/sequences.fasta"

# This command excludes all strains by default and then forces the inclusion of
# the strains selected by the subsampling logic defined above.
subsampling:
state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
force_include: --exclude-all --include ../nextclade/defaults/include.txt

traits:
metadata_columns: [
'country',
'division',
'location',
'clade_membership',
'host'
]

export:
auspice_config: "defaults/auspice_config.json"

## Custom rules to run as part of the CI automated workflow
## The paths should be relative to the phylogenetic directory.
#custom_rules:
# - build-configs/ci/copy_example_data.smk
custom_rules:
- build-configs/washington-state/washington-state-rules.smk
17 changes: 0 additions & 17 deletions phylogenetic/build-configs/washington-state/copy_example_data.smk

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
These are Washington State-specific rules for the phylogenetic workflow.
"""

rule create_lat_longs:
"""
Create an averaged lat_longs.tsv file from metadata_filtered.tsv by running
scripts/create_lat_longs.py. This requires a USA state annotation in the
metadata, so this rule fails on global datasets. The script's stdout and
stderr are both piped through tee into the log file.
"""
input:
metadata = "results/metadata_filtered.tsv"
output:
lat_longs = "results/lat_longs.tsv"
log:
"logs/lat_longs.txt",
benchmark:
"benchmarks/lat_longs.txt"
shell:
"""
python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log}
"""


rule create_colors:
"""
Create results/colors.tsv from results/metadata_filtered.tsv by running
scripts/make_colors.py. The script's stdout and stderr are both piped
through tee into the log file.
"""
input:
metadata = "results/metadata_filtered.tsv"
output:
colors = "results/colors.tsv"
log:
"logs/colors.txt",
benchmark:
"benchmarks/colors.txt"
shell:
"""
python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log}
"""


rule export_washington_build:
"""
This part of the workflow collects the phylogenetic tree and annotations to
export a Nextstrain dataset with `augur export v2`.
In addition to the tree, metadata, and node data, it passes the
results/colors.tsv and results/lat_longs.tsv annotations to the export.
The description and Auspice config paths are read from the `export` section
of the workflow config.
"""
input:
tree = "results/tree.nwk",
metadata = "results/metadata_filtered.tsv",
branch_lengths = "results/branch_lengths.json",
traits = "results/traits.json",
nt_muts = "results/nt_muts.json",
aa_muts = "results/aa_muts.json",
colors = "results/colors.tsv",
description = config["export"]["description"],
lat_longs = "results/lat_longs.tsv",
auspice_config = config["export"]["auspice_config"],
output:
auspice = "auspice/WNV_genome.json"
log:
"logs/export.txt",
benchmark:
"benchmarks/export.txt"
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns "accession" \
--node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
--colors {input.colors} \
--lat-longs {input.lat_longs} \
--description {input.description} \
--auspice-config {input.auspice_config} \
--output {output.auspice} 2>&1 | tee {log}
"""

# The export_washington_build rule writes the same output file as the default
# export rule, so this ruleorder directive resolves the ambiguity in its favor.
ruleorder: export_washington_build > export
14 changes: 3 additions & 11 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,20 +66,12 @@ input_sequences: "data/sequences.fasta"
# This command excludes all strains by default and then forces the inclusion of
# the strains selected by the subsampling logic defined above.
subsampling:
state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
force_include: --exclude-all --include ../nextclade/defaults/include.txt
#global: --query "country != 'USA'" --group-by country year --subsample-max-sequences 200
region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000

traits:
metadata_columns: [
'country',
'division',
'location',
'clade_membership',
'host'
'region',
'country'
]

export:
Expand Down
4 changes: 0 additions & 4 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ rule export:
traits = "results/traits.json",
nt_muts = "results/nt_muts.json",
aa_muts = "results/aa_muts.json",
colors = "results/colors.tsv",
description = config["export"]["description"],
lat_longs = "results/lat_longs.tsv",
auspice_config = config["export"]["auspice_config"],
output:
auspice = "auspice/WNV_genome.json"
Expand All @@ -51,8 +49,6 @@ rule export:
--metadata {input.metadata} \
--metadata-id-columns "accession" \
--node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
--colors {input.colors} \
--lat-longs {input.lat_longs} \
--description {input.description} \
--auspice-config {input.auspice_config} \
--output {output.auspice} 2>&1 | tee {log}
Expand Down
28 changes: 0 additions & 28 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -51,34 +51,6 @@ rule decompress:
"""


rule create_colors:
input:
metadata = "results/metadata_filtered.tsv"
output:
colors = "results/colors.tsv"
log:
"logs/colors.txt",
benchmark:
"benchmarks/colors.txt"
shell:
"""
python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log}
"""

rule create_lat_longs:
input:
metadata = "results/metadata_filtered.tsv"
output:
lat_longs = "results/lat_longs.tsv"
log:
"logs/lat_longs.txt",
benchmark:
"benchmarks/lat_longs.txt"
shell:
"""
python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log}
"""

rule align:
input:
sequences = "results/sequences_filtered.fasta",
Expand Down

0 comments on commit 39d468d

Please sign in to comment.