nextstrain · genehack · Jul 2, 2024 · Jul 2, 2024 · Jul 2, 2024 · Jul 2, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -14,6 +14,7 @@
 * Added a new sub-command `augur curate abbreviate-authors` to abbreviate lists of authors to "<first author> et al." Previously, this was available as the `transform-authors` script within the nextstrain/ingest repo. [#1483][] (@genehack)
 * Added a new sub-command `augur curate parse-genbank-location` to parse the `geo_loc_name` field from GenBank records. Previously, this was available as the `transform-genbank-location` script within the nextstrain/ingest repo. [#1485][] (@genehack)
 * curate format-dates: Added defaults to `--expected-date-formats` so that ISO 8601 dates (`%Y-%m-%d`) and its various masked forms (e.g. `%Y-XX-XX`) are automatically parsed by the command. [#1501][] (@joverlee521)
+* Added a new sub-command `augur curate transform-strain-name` to filter strain names based on matching a regular expression. Previously, this was available as the `transform-strain-names` script within the nextstrain/ingest repo. [#1486][] (@genehack)
 
 ### Bug Fixes
 

diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py
@@ -12,7 +12,7 @@
 from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv
 from augur.io.sequences import write_records_to_fasta
 from augur.types import DataErrorMethod
-from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location
+from . import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location, transform_strain_name
 
 
 SUBCOMMAND_ATTRIBUTE = '_curate_subcommand'
@@ -25,6 +25,7 @@
     apply_record_annotations,
     abbreviate_authors,
     parse_genbank_location,
+    transform_strain_name,
 ]
 
 

diff --git a/augur/curate/transform_strain_name.py b/augur/curate/transform_strain_name.py
@@ -0,0 +1,78 @@
+"""
+Verifies strain name pattern in the 'strain' field of the NDJSON
+record. Adds a 'strain' field to the record if it does not already
+exist.
+"""
+
+import argparse
+import re
+from typing import Generator, List
+from augur.io.print import print_err
+from augur.utils import first_line
+
+
+def transform_name(
+    record: dict,
+    index: int,
+    strain_name_pattern: re.Pattern,
+    backup_fields: List[str],
+) -> dict:
+    """
+    Check the 'strain' field of *record* against *strain_name_pattern*.
+
+    If the current value does not match, it is replaced by the first truthy
+    value found among *backup_fields* (converted with ``str``), or by the
+    empty string when no backup field yields a value. The record is modified
+    in place and also returned; after the call it always has a 'strain' key.
+
+    Parameters
+    ----------
+    record
+        The record to update.
+    index
+        Position of the record in the input, used only in the warning text.
+    strain_name_pattern
+        Compiled pattern applied with ``match`` (anchored at the start).
+    backup_fields
+        Field names to try, in order, when 'strain' does not match.
+
+    Returns
+    -------
+    dict
+        The same *record* object that was passed in.
+    """
+    # A missing or null 'strain' is treated as the empty string and any other
+    # non-string value is stringified, so that `match` never raises TypeError.
+    current = record.get("strain")
+    strain = "" if current is None else str(current)
+
+    if strain_name_pattern.match(strain) is None:
+        # Fall back to the first backup field holding a truthy value,
+        # or to the empty string if there is none.
+        strain = next(
+            (
+                str(record[field])
+                for field in (backup_fields or ())
+                if record.get(field)
+            ),
+            "",
+        )
+
+    # Always write the value back: this guarantees the key exists even when
+    # the pattern accepts the empty string and the record had no 'strain'.
+    record["strain"] = strain
+
+    if strain == "":
+        print_err(
+            f"WARNING: Record number {index} has an empty string as the strain name.",
+        )
+
+    return record
+
+
+def register_parser(
+    parent_subparsers: argparse._SubParsersAction,
+) -> argparse.ArgumentParser:
+    """
+    Register the ``transform-strain-name`` sub-command on *parent_subparsers*.
+
+    The new parser inherits from ``parent_subparsers.shared_parser``
+    (NOTE(review): that attribute is attached outside this module; presumably
+    it carries the options common to all curate sub-commands; confirm against
+    the caller) and uses the first line of this module's docstring as its
+    help text.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        The parser created by ``add_parser``, with ``--strain-regex`` and
+        ``--backup-fields`` added.
+    """
+    parser = parent_subparsers.add_parser(
+        "transform-strain-name",
+        parents=[parent_subparsers.shared_parser],  # type: ignore
+        # `__doc__` resolves to the module docstring here, not this function's.
+        help=first_line(__doc__),
+    )
+
+    parser.add_argument(
+        "--strain-regex",
+        # The default accepts any non-empty strain name.
+        default="^.+$",
+        help=(
+            "Regex pattern for strain names. "
+            "Strain names that do not match the pattern will be dropped."
+        ),
+    )
+    parser.add_argument(
+        "--backup-fields",
+        nargs="*",
+        default=[],
+        help=(
+            "List of backup fields to use as strain name if the value in 'strain' "
+            "does not match the strain regex pattern. "
+            "If multiple fields are provided, will use the first field that has a non-empty string."
+        ),
+    )
+
+    return parser
+
+
+def run(args: argparse.Namespace, records: List[dict]) -> Generator[dict, None, None]:
+    """
+    Apply the strain name check to every record and yield the records in order.
+
+    The pattern from ``args.strain_regex`` is compiled once up front. Each
+    record is handed to ``transform_name`` together with its zero-based
+    position and ``args.backup_fields``; the return value of that call is
+    ignored and the record object itself is yielded afterwards.
+    """
+    pattern = re.compile(args.strain_regex)
+
+    for record_number, entry in enumerate(records):
+        transform_name(
+            record=entry,
+            index=record_number,
+            strain_name_pattern=pattern,
+            backup_fields=args.backup_fields,
+        )
+
+        yield entry
diff --git a/tests/functional/curate/cram/transform-strain-name/default-behavior.t b/tests/functional/curate/cram/transform-strain-name/default-behavior.t
@@ -0,0 +1,35 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"
+
+Running the command with no arguments produces the expected output
+
+  $ echo '{"strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name
+  {"strain": "OC43"}
+
+Providing a strain regex to the command produces the expected output when the strain matches
+
+  $ echo '{"strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --strain-regex '^\w{2}\d{2}$'
+  {"strain": "OC43"}
+
+Providing a strain regex to the command produces an empty field and a warning when the strain doesn't match
+
+  $ echo '{"strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --strain-regex '^\d{2}\w{2}$'
+  WARNING: Record number 0 has an empty string as the strain name.
+  {"strain": ""}
+
+Providing a backup field produces the expected output
+
+  $ echo '{"potential-strain":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --backup-fields potential-strain
+  {"potential-strain": "OC43", "strain": "OC43"}
+
+
+Multiple backup fields produce the expected output
+
+  $ echo '{"potential-strain2":"OC43"}' \
+  >   | ${AUGUR} curate transform-strain-name --backup-fields potential-strain potential-strain2
+  {"potential-strain2": "OC43", "strain": "OC43"}