Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the latest "augur curate" commands and various other refactoring #7

Merged
merged 15 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 21 additions & 14 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
# For augur repo #
# Files created by workflows that we usually want to keep out of git
auspice/
builds/
data/
results/
logs/
benchmarks/

# Sensitive environment variables
environment*
temp*
results*
data*
s3/
.snakemake
*stderr
env.d/

# Snakemake
.snakemake/

# For Python #
##############
*.pyc
.tox/
.cache/
augur.egg-info/

# Compiled source #
###################
Expand All @@ -34,11 +38,14 @@ augur.egg-info/
Icon?
ehthumbs.db
Thumbs.db

# autosave #
############
*~

# IDE #
#######
*sublime*
# IDE generated files #
######################
.vscode/

# nohup output
nohup.out

# cluster logs
slurm-*
90 changes: 27 additions & 63 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,75 +1,39 @@
# The workflow filepaths are written relative to this Snakefile's base directory
workdir: workflow.current_basedir

if not config:

configfile: "defaults/config.yaml"


send_slack_notifications = config.get("send_slack_notifications", False)
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

serotypes = ["all"]
wildcard_constraints:
serotype = "|".join(serotypes)


def _get_all_targets(wildcards):
# Default targets are the metadata TSV and sequences FASTA files
all_targets = expand(
["results/sequences_{serotype}.fasta.zst", "results/metadata_{serotype}.tsv.zst"],
serotype=serotypes,
)

# Add additional targets based on upload config
upload_config = config.get("upload", {})

for target, params in upload_config.items():
files_to_upload = params.get("files_to_upload", [])
remote_file_names = params.get("remote_file_names", [])

if len(files_to_upload) != len(remote_file_names):
print(
f"Skipping file upload for {target!r} because the number of",
"files to upload does not match the number of remote file names.",
)
elif len(remote_file_names) != len(set(remote_file_names)):
print(
f"Skipping file upload for {target!r} because there are duplicate remote file names."
)
elif not params.get("dst"):
print(
f"Skipping file upload for {target!r} because the destination was not defined."
)
else:
all_targets.extend(
expand(
[
f"data/upload/{target}/{{file_to_upload}}-to-{{remote_file_name}}.done"
],
zip,
file_to_upload=files_to_upload,
remote_file_name=remote_file_names,
)
)

# Add additional targets for Nextstrain's internal Slack notifications
if send_slack_notifications:
all_targets.extend(
[
"data/notify/genbank-record-change.done",
"data/notify/metadata-diff.done",
]
)

if config.get("trigger_rebuild"):
all_targets.append("data/trigger/rebuild.done")

return all_targets


# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
rule all:
input:
_get_all_targets,

sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),

# Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
# If there are build-specific customizations, they should be added with the
# custom_rules imported below to ensure that the core workflow is not complicated
# by build-specific rules.
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"

# Allow users to import custom rules provided via the config.
# This allows users to run custom rules that can extend or override the workflow.
# A concrete example of using custom rules is the extension of the workflow with
# rules to support the Nextstrain automation that uploads files and sends internal
# Slack notifications.
# For extensions, the user will have to specify the custom rule targets when
# running the workflow.
# For overrides, the custom Snakefile will have to use the `ruleorder` directive
# to allow Snakemake to handle ambiguous rules
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
2 changes: 0 additions & 2 deletions ingest/benchmarks/extract_ncbi_dataset_sequences.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/fetch_ncbi_dataset_package.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/format_ncbi_dataset_report.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/format_ncbi_datasets_ndjson.txt

This file was deleted.

135 changes: 93 additions & 42 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,54 +1,101 @@
# Sources of sequences to include in the ingest run
sources: ['genbank']
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run to completion.
#
# Define optional config parameters with their default values here so that users
# do not have to dig through the workflows to figure out the default values

# Params for the curate rule
# Required to fetch from NCBI Datasets
ncbi_taxon_id: "11082"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- bioprojects
- biosample-acc
- sra-accs
- submitter-names
- submitter-affiliation

# Config parameters related to the curate pipeline
curate:
# Fields to rename.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution']
# URL pointed to public generalized geolocation rules
# For the Nextstrain team, this is currently
# "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "defaults/geolocation-rules.tsv"
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: accession
accession_version: accession_version
sourcedb: database
isolate-lineage: strain
geo-region: region
geo-location: location
isolate-collection-date: date
release-date: date_released
update-date: date_updated
length: length
host-name: host
isolate-lineage-source: sample_type
biosample-acc: biosample_accessions
sra-accs: sra_accessions
submitter-names: authors
submitter-affiliation: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
strain_regex: '^.+$'
# Back up strain name field if 'strain' doesn't match regex above
strain_backup_fields: ['strain_s', 'accession']
# List of date fields to standardize
date_fields: ['date', 'date_submitted', 'updated']
# Expected date formats present in date fields
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: "^.+$"
# Back up strain name field to use if "strain" doesn't match regex above
strain_backup_fields: ["accession"]
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ["date", "date_released", "date_updated"]
# List of expected date formats that are present in the date fields provided above
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
# Titlecase rules
expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
# The expected field that contains the GenBank geo_loc_name
genbank_location_field: location
titlecase:
# Abbreviations not cast to titlecase, keeps uppercase
abbreviations: ['USA']
# List of string fields to titlecase
fields: ["region", "country", "division", "location"]
# List of abbreviations not cast to titlecase, keeps uppercase
abbreviations: ["USA"]
# Articles that should not be cast to titlecase
articles: [
'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
"and", "d", "de", "del", "des", "di", "do", "en", "l", "la", "las", "le",
"los", "nad", "of", "op", "sur", "the", "y"
]
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# Authors field name
authors_field: 'authors'
# Authors default value if authors value is empty
authors_default_value: '?'
# Field name for the generated abbreviated authors
abbr_authors_field: 'abbr_authors'
# General geolocation rules to apply to geolocation fields
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# Local geolocation rules that are only applicable to dengue data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'defaults/geolocation-rules.tsv'
# User annotations file
annotations: 'defaults/annotations.tsv'
# ID field used to merge annotations
annotations_id: 'accession'
# Field to use as the sequence ID in the FASTA file
id_field: 'accession'
# Field to use as the sequence in the FASTA file
sequence_field: 'sequence'
# Final output columns for the metadata TSV
# Metadata field that contains the list of authors associated with the sequence
authors_field: "authors"
# Default value to use if the authors field is empty
authors_default_value: "?"
# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: "accession"
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: "sequence"
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'accession',
#'genbank_accession_rev',
Expand Down Expand Up @@ -78,3 +125,7 @@ curate:
'url',
'length',
]

nextclade:
nextclade_dataset_path: '../nextclade/dataset'
nextclade_field: 'clade_membership'
1 change: 0 additions & 1 deletion ingest/logs/curate_all.txt

This file was deleted.

Empty file.
Loading