Skip to content

Commit

Permalink
Use the latest "augur curate" commands and various other refactoring #7
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Aug 16, 2024
2 parents 3060a7f + 6829bad commit 51856f9
Show file tree
Hide file tree
Showing 33 changed files with 332 additions and 1,548 deletions.
35 changes: 21 additions & 14 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
# For augur repo #
# Files created by workflows that we usually want to keep out of git
auspice/
builds/
data/
results/
logs/
benchmarks/

# Sensitive environment variables
environment*
temp*
results*
data*
s3/
.snakemake
*stderr
env.d/

# Snakemake
.snakemake/

# For Python #
##############
*.pyc
.tox/
.cache/
augur.egg-info/

# Compiled source #
###################
Expand All @@ -34,11 +38,14 @@ augur.egg-info/
Icon?
ehthumbs.db
Thumbs.db

# autosave #
############
*~

# IDE #
#######
*sublime*
# IDE generated files #
######################
.vscode/

# nohup output
nohup.out

# cluster logs
slurm-*
90 changes: 27 additions & 63 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,75 +1,39 @@
# The workflow filepaths are written relative to this Snakefile's base directory
workdir: workflow.current_basedir

if not config:

configfile: "defaults/config.yaml"


send_slack_notifications = config.get("send_slack_notifications", False)
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

serotypes = ["all"]
wildcard_constraints:
serotype = "|".join(serotypes)


def _get_all_targets(wildcards):
# Default targets are the metadata TSV and sequences FASTA files
all_targets = expand(
["results/sequences_{serotype}.fasta.zst", "results/metadata_{serotype}.tsv.zst"],
serotype=serotypes,
)

# Add additional targets based on upload config
upload_config = config.get("upload", {})

for target, params in upload_config.items():
files_to_upload = params.get("files_to_upload", [])
remote_file_names = params.get("remote_file_names", [])

if len(files_to_upload) != len(remote_file_names):
print(
f"Skipping file upload for {target!r} because the number of",
"files to upload does not match the number of remote file names.",
)
elif len(remote_file_names) != len(set(remote_file_names)):
print(
f"Skipping file upload for {target!r} because there are duplicate remote file names."
)
elif not params.get("dst"):
print(
                f"Skipping file upload for {target!r} because the destination was not defined."
)
else:
all_targets.extend(
expand(
[
f"data/upload/{target}/{{file_to_upload}}-to-{{remote_file_name}}.done"
],
zip,
file_to_upload=files_to_upload,
remote_file_name=remote_file_names,
)
)

# Add additional targets for Nextstrain's internal Slack notifications
if send_slack_notifications:
all_targets.extend(
[
"data/notify/genbank-record-change.done",
"data/notify/metadata-diff.done",
]
)

if config.get("trigger_rebuild"):
all_targets.append("data/trigger/rebuild.done")

return all_targets


# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
rule all:
input:
_get_all_targets,

sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),

# Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
# If there are build-specific customizations, they should be added with the
# custom_rules imported below to ensure that the core workflow is not complicated
# by build-specific rules.
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"

# Allow users to import custom rules provided via the config.
# This allows users to run custom rules that can extend or override the workflow.
# A concrete example of using custom rules is the extension of the workflow with
# rules to support the Nextstrain automation that uploads files and sends internal
# Slack notifications.
# For extensions, the user will have to specify the custom rule targets when
# running the workflow.
# For overrides, the custom Snakefile will have to use the `ruleorder` directive
# to allow Snakemake to handle ambiguous rules
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
2 changes: 0 additions & 2 deletions ingest/benchmarks/extract_ncbi_dataset_sequences.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/fetch_ncbi_dataset_package.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/format_ncbi_dataset_report.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/format_ncbi_datasets_ndjson.txt

This file was deleted.

135 changes: 93 additions & 42 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,54 +1,101 @@
# Sources of sequences to include in the ingest run
sources: ['genbank']
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run to completion.
#
# Define optional config parameters with their default values here so that users
# do not have to dig through the workflows to figure out the default values

# Params for the curate rule
# Required to fetch from NCBI Datasets
ncbi_taxon_id: "11082"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- bioprojects
- biosample-acc
- sra-accs
- submitter-names
- submitter-affiliation

# Config parameters related to the curate pipeline
curate:
# Fields to rename.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution']
# URL pointed to public generalized geolocation rules
# For the Nextstrain team, this is currently
# "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "defaults/geolocation-rules.tsv"
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: accession
accession_version: accession_version
sourcedb: database
isolate-lineage: strain
geo-region: region
geo-location: location
isolate-collection-date: date
release-date: date_released
update-date: date_updated
length: length
host-name: host
isolate-lineage-source: sample_type
biosample-acc: biosample_accessions
sra-accs: sra_accessions
submitter-names: authors
submitter-affiliation: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
strain_regex: '^.+$'
# Back up strain name field if 'strain' doesn't match regex above
strain_backup_fields: ['strain_s', 'accession']
# List of date fields to standardize
date_fields: ['date', 'date_submitted', 'updated']
# Expected date formats present in date fields
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: "^.+$"
  # Back up strain name field to use if "strain" doesn't match regex above
strain_backup_fields: ["accession"]
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ["date", "date_released", "date_updated"]
# List of expected date formats that are present in the date fields provided above
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
# Titlecase rules
expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
# The expected field that contains the GenBank geo_loc_name
genbank_location_field: location
titlecase:
# Abbreviations not cast to titlecase, keeps uppercase
abbreviations: ['USA']
# List of string fields to titlecase
fields: ["region", "country", "division", "location"]
# List of abbreviations not cast to titlecase, keeps uppercase
abbreviations: ["USA"]
# Articles that should not be cast to titlecase
articles: [
'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
"and", "d", "de", "del", "des", "di", "do", "en", "l", "la", "las", "le",
"los", "nad", "of", "op", "sur", "the", "y"
]
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# Authors field name
authors_field: 'authors'
# Authors default value if authors value is empty
authors_default_value: '?'
# Field name for the generated abbreviated authors
abbr_authors_field: 'abbr_authors'
# General geolocation rules to apply to geolocation fields
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# Local geolocation rules that are only applicable to dengue data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'defaults/geolocation-rules.tsv'
# User annotations file
annotations: 'defaults/annotations.tsv'
# ID field used to merge annotations
annotations_id: 'accession'
# Field to use as the sequence ID in the FASTA file
id_field: 'accession'
# Field to use as the sequence in the FASTA file
sequence_field: 'sequence'
# Final output columns for the metadata TSV
# Metadata field that contains the list of authors associated with the sequence
authors_field: "authors"
# Default value to use if the authors field is empty
authors_default_value: "?"
# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: "accession"
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: "sequence"
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'accession',
#'genbank_accession_rev',
Expand Down Expand Up @@ -78,3 +125,7 @@ curate:
'url',
'length',
]

nextclade:
nextclade_dataset_path: '../nextclade/dataset'
nextclade_field: 'clade_membership'
1 change: 0 additions & 1 deletion ingest/logs/curate_all.txt

This file was deleted.

Empty file.
Loading

0 comments on commit 51856f9

Please sign in to comment.