Skip to content

Commit

Permalink
Use the latest "augur curate" commands and various other refactoring #7
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Aug 16, 2024
2 parents 3060a7f + 6829bad commit 51856f9
Show file tree
Hide file tree
Showing 33 changed files with 332 additions and 1,548 deletions.
35 changes: 21 additions & 14 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
# For augur repo #
# Files created by workflows that we usually want to keep out of git
auspice/
builds/
data/
results/
logs/
benchmarks/

# Sensitive environment variables
environment*
temp*
results*
data*
s3/
.snakemake
*stderr
env.d/

# Snakemake
.snakemake/

# For Python #
##############
*.pyc
.tox/
.cache/
augur.egg-info/

# Compiled source #
###################
Expand All @@ -34,11 +38,14 @@ augur.egg-info/
Icon?
ehthumbs.db
Thumbs.db

# autosave #
############
*~

# IDE #
#######
*sublime*
# IDE generated files #
######################
.vscode/

# nohup output
nohup.out

# cluster logs
slurm-*
90 changes: 27 additions & 63 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,75 +1,39 @@
# The workflow filepaths are written relative to this Snakefile's base directory
workdir: workflow.current_basedir

if not config:

configfile: "defaults/config.yaml"


send_slack_notifications = config.get("send_slack_notifications", False)
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

serotypes = ["all"]
wildcard_constraints:
serotype = "|".join(serotypes)


def _get_all_targets(wildcards):
# Default targets are the metadata TSV and sequences FASTA files
all_targets = expand(
["results/sequences_{serotype}.fasta.zst", "results/metadata_{serotype}.tsv.zst"],
serotype=serotypes,
)

# Add additional targets based on upload config
upload_config = config.get("upload", {})

for target, params in upload_config.items():
files_to_upload = params.get("files_to_upload", [])
remote_file_names = params.get("remote_file_names", [])

if len(files_to_upload) != len(remote_file_names):
print(
f"Skipping file upload for {target!r} because the number of",
"files to upload does not match the number of remote file names.",
)
elif len(remote_file_names) != len(set(remote_file_names)):
print(
f"Skipping file upload for {target!r} because there are duplicate remote file names."
)
elif not params.get("dst"):
print(
                f"Skipping file upload for {target!r} because the destination was not defined."
)
else:
all_targets.extend(
expand(
[
f"data/upload/{target}/{{file_to_upload}}-to-{{remote_file_name}}.done"
],
zip,
file_to_upload=files_to_upload,
remote_file_name=remote_file_names,
)
)

# Add additional targets for Nextstrain's internal Slack notifications
if send_slack_notifications:
all_targets.extend(
[
"data/notify/genbank-record-change.done",
"data/notify/metadata-diff.done",
]
)

if config.get("trigger_rebuild"):
all_targets.append("data/trigger/rebuild.done")

return all_targets


# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
rule all:
input:
_get_all_targets,

sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),

# Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
# If there are build-specific customizations, they should be added with the
# custom_rules imported below to ensure that the core workflow is not complicated
# by build-specific rules.
include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"
include: "rules/nextclade.smk"

# Allow users to import custom rules provided via the config.
# This allows users to run custom rules that can extend or override the workflow.
# A concrete example of using custom rules is the extension of the workflow with
# rules to support the Nextstrain automation that uploads files and sends internal
# Slack notifications.
# For extensions, the user will have to specify the custom rule targets when
# running the workflow.
# For overrides, the custom Snakefile will have to use the `ruleorder` directive
# to allow Snakemake to handle ambiguous rules
# https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#handling-ambiguous-rules
if "custom_rules" in config:
for rule_file in config["custom_rules"]:

include: rule_file
2 changes: 0 additions & 2 deletions ingest/benchmarks/extract_ncbi_dataset_sequences.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/fetch_ncbi_dataset_package.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/format_ncbi_dataset_report.txt

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/benchmarks/format_ncbi_datasets_ndjson.txt

This file was deleted.

135 changes: 93 additions & 42 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,54 +1,101 @@
# Sources of sequences to include in the ingest run
sources: ['genbank']
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run to completion.
#
# Define optional config parameters with their default values here so that users
# do not have to dig through the workflows to figure out the default values

# Params for the curate rule
# Required to fetch from NCBI Datasets
ncbi_taxon_id: "11082"

# The list of NCBI Datasets fields to include from NCBI Datasets output
# These need to be the "mnemonics" of the NCBI Datasets fields, see docs for full list of fields
# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
# Note: the "accession" field MUST be provided to match with the sequences
ncbi_datasets_fields:
- accession
- sourcedb
- isolate-lineage
- geo-region
- geo-location
- isolate-collection-date
- release-date
- update-date
- length
- host-name
- isolate-lineage-source
- bioprojects
- biosample-acc
- sra-accs
- submitter-names
- submitter-affiliation

# Config parameters related to the curate pipeline
curate:
# Fields to rename.
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution']
# URL pointed to public generalized geolocation rules
# For the Nextstrain team, this is currently
# "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "defaults/geolocation-rules.tsv"
# The original field names should match the ncbi_datasets_fields provided above.
# This is the first step in the pipeline, so any references to field names in the configs below should use the new field names
field_map:
accession: accession
accession_version: accession_version
sourcedb: database
isolate-lineage: strain
geo-region: region
geo-location: location
isolate-collection-date: date
release-date: date_released
update-date: date_updated
length: length
host-name: host
isolate-lineage-source: sample_type
biosample-acc: biosample_accessions
sra-accs: sra_accessions
submitter-names: authors
submitter-affiliation: institution
# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names
strain_regex: '^.+$'
# Back up strain name field if 'strain' doesn't match regex above
strain_backup_fields: ['strain_s', 'accession']
# List of date fields to standardize
date_fields: ['date', 'date_submitted', 'updated']
# Expected date formats present in date fields
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: "^.+$"
  # Back up strain name field to use if "strain" doesn't match regex above
strain_backup_fields: ["accession"]
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: ["date", "date_released", "date_updated"]
# List of expected date formats that are present in the date fields provided above
# These date formats should use directives expected by datetime
# See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
# Titlecase rules
expected_date_formats: ["%Y", "%Y-%m", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ"]
# The expected field that contains the GenBank geo_loc_name
genbank_location_field: location
titlecase:
# Abbreviations not cast to titlecase, keeps uppercase
abbreviations: ['USA']
# List of string fields to titlecase
fields: ["region", "country", "division", "location"]
# List of abbreviations not cast to titlecase, keeps uppercase
abbreviations: ["USA"]
# Articles that should not be cast to titlecase
articles: [
'and', 'd', 'de', 'del', 'des', 'di', 'do', 'en', 'l', 'la', 'las', 'le',
'los', 'nad', 'of', 'op', 'sur', 'the', 'y'
"and", "d", "de", "del", "des", "di", "do", "en", "l", "la", "las", "le",
"los", "nad", "of", "op", "sur", "the", "y"
]
# List of string fields to titlecase
fields: ['region', 'country', 'division', 'location']
# Authors field name
authors_field: 'authors'
# Authors default value if authors value is empty
authors_default_value: '?'
# Field name for the generated abbreviated authors
abbr_authors_field: 'abbr_authors'
# General geolocation rules to apply to geolocation fields
geolocation_rules_url: 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
# Local geolocation rules that are only applicable to dengue data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'defaults/geolocation-rules.tsv'
# User annotations file
annotations: 'defaults/annotations.tsv'
# ID field used to merge annotations
annotations_id: 'accession'
# Field to use as the sequence ID in the FASTA file
id_field: 'accession'
# Field to use as the sequence in the FASTA file
sequence_field: 'sequence'
# Final output columns for the metadata TSV
# Metadata field that contains the list of authors associated with the sequence
authors_field: "authors"
# Default value to use if the authors field is empty
authors_default_value: "?"
# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"
# Path to the manual annotations file
# The path should be relative to the ingest directory
annotations: "defaults/annotations.tsv"
# The ID field in the metadata to use to merge the manual annotations
annotations_id: "accession"
# The ID field in the metadata to use as the sequence id in the output FASTA file
output_id_field: "accession"
# The field in the NDJSON record that contains the actual genomic sequence
output_sequence_field: "sequence"
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: [
'accession',
#'genbank_accession_rev',
Expand Down Expand Up @@ -78,3 +125,7 @@ curate:
'url',
'length',
]

nextclade:
nextclade_dataset_path: '../nextclade/dataset'
nextclade_field: 'clade_membership'
1 change: 0 additions & 1 deletion ingest/logs/curate_all.txt

This file was deleted.

Empty file.
Loading

0 comments on commit 51856f9

Please sign in to comment.