diff --git a/ingest/vendored/.cramrc b/ingest/vendored/.cramrc
deleted file mode 100644
index 153d20f..0000000
--- a/ingest/vendored/.cramrc
+++ /dev/null
@@ -1,3 +0,0 @@
-[cram]
-shell = /bin/bash
-indent = 2
diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml
index 4d445ec..c716277 100644
--- a/ingest/vendored/.github/workflows/ci.yaml
+++ b/ingest/vendored/.github/workflows/ci.yaml
@@ -13,11 +13,3 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: nextstrain/.github/actions/shellcheck@master
-
-  cram:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-      - run: pip install cram
-      - run: cram tests/
diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo
index 71c778a..cf919bc 100644
--- a/ingest/vendored/.gitrepo
+++ b/ingest/vendored/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
     remote = https://github.com/nextstrain/ingest
    branch = main
-   commit = c94d78d1f38b99e893007a76526f3d3824ecded0
-   parent = ad0b045811bdd327a4caadf1f2ce9b94430dc4cb
+   commit = 258ab8ce898a88089bc88caee336f8d683a0e79a
+   parent = c839dc0c4c44805fd29e2437bd6199c9b8d64c5a
    method = merge
    cmdver = 0.4.6
diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md
index 8c24bd7..a2b54cb 100644
--- a/ingest/vendored/README.md
+++ b/ingest/vendored/README.md
@@ -117,15 +117,6 @@ Potential Nextstrain CLI scripts
 
 - [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`.
 
-Potential augur curate scripts
-
-- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records
-- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records
-- [transform-authors](transform-authors) - Abbreviates full author lists to '<first author> et al.'
-- [transform-field-names](transform-field-names) - Rename fields of NDJSON records
-- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"<country_value>[:<region>][, <locality>]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/)
-- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields.
-
 ## Software requirements
 
 Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date.
@@ -134,11 +125,6 @@ Some scripts may require Bash ≥4. If you are running these scripts on macOS, t
 
 Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack.
 
-For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. To run these locally,
-
-1. Download Cram: `pip install cram`
-2. Run the tests: `cram tests/`
-
 ## Working on this repo
 
 This repo is configured to use [pre-commit](https://pre-commit.com),
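The README hunk above drops the index of six curate scripts whose deletions make up the rest of this diff. For orientation, here is a sketch of how they were typically chained over an NDJSON stream (one JSON record per line). The flags come from the scripts' own argparse definitions in the hunks below; the file names and the `Isolate_Name=strain` mapping are hypothetical placeholders.

```sh
# Illustrative pipeline; each script reads NDJSON on stdin and writes
# NDJSON on stdout. Input/output names here are placeholders.
cat records.ndjson \
  | ./transform-field-names --field-map "Isolate_Name=strain" \
  | ./transform-strain-names --strain-regex '^.+$' --backup-fields accession \
  | ./transform-authors --abbr-authors-field abbr_authors \
  | ./transform-genbank-location \
  | ./apply-geolocation-rules --geolocation-rules geolocation_rules.tsv \
  | ./merge-user-metadata --annotations annotations.tsv --id-field accession \
  > curated.ndjson
```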
diff --git a/ingest/vendored/apply-geolocation-rules b/ingest/vendored/apply-geolocation-rules
deleted file mode 100755
index 776cf16..0000000
--- a/ingest/vendored/apply-geolocation-rules
+++ /dev/null
@@ -1,234 +0,0 @@
-#!/usr/bin/env python3
-"""
-Applies user curated geolocation rules to the geolocation fields in the NDJSON
-records from stdin. The modified records are output to stdout. This does not do
-any additional transformations on top of the user curations.
-"""
-import argparse
-import json
-from collections import defaultdict
-from sys import exit, stderr, stdin, stdout
-
-
-class CyclicGeolocationRulesError(Exception):
-    pass
-
-
-def load_geolocation_rules(geolocation_rules_file):
-    """
-    Loads the geolocation rules from the provided *geolocation_rules_file*.
-    Returns the rules as a dict:
-    {
-        regions: {
-            countries: {
-                divisions: {
-                    locations: corrected_geolocations_tuple
-                }
-            }
-        }
-    }
-    """
-    geolocation_rules = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
-    with open(geolocation_rules_file, 'r') as rules_fh:
-        for line in rules_fh:
-            # ignore comments
-            if line.strip()=="" or line.lstrip()[0] == '#':
-                continue
-
-            row = line.strip('\n').split('\t')
-            # Skip lines that cannot be split into raw and annotated geolocations
-            if len(row) != 2:
-                print(
-                    f"WARNING: Could not decode geolocation rule {line!r}.",
-                    "Please make sure rules are formatted as",
-                    "'region/country/division/location<tab>region/country/division/location'.",
-                    file=stderr)
-                continue
-
-            # remove trailing comments
-            row[-1] = row[-1].partition('#')[0].rstrip()
-            raw , annot = tuple( row[0].split('/') ) , tuple( row[1].split('/') )
-
-            # Skip lines where raw or annotated geolocations cannot be split into 4 fields
-            if len(raw) != 4:
-                print(
-                    f"WARNING: Could not decode the raw geolocation {row[0]!r}.",
-                    "Please make sure it is formatted as 'region/country/division/location'.",
-                    file=stderr
-                )
-                continue
-
-            if len(annot) != 4:
-                print(
-                    f"WARNING: Could not decode the annotated geolocation {row[1]!r}.",
-                    "Please make sure it is formatted as 'region/country/division/location'.",
-                    file=stderr
-                )
-                continue
-
-
-            geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot
-
-    return geolocation_rules
-
-
-def get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal = None):
-    """
-    Gets the annotated geolocation for the *raw_geolocation* in the provided
-    *geolocation_rules*.
-
-    Recursively traverses the *geolocation_rules* until we get the annotated
-    geolocation, which must be a Tuple. Returns `None` if there are no
-    applicable rules for the provided *raw_geolocation*.
-
-    Rules are applied in the order of region, country, division, location.
-    First checks the provided raw values for geolocation fields, then if there
-    are no matches, tries to use general rules marked with '*'.
-    """
-    # Always instantiate the rule traversal as an empty list if not provided,
-    # e.g. the first call of this recursive function
-    if rule_traversal is None:
-        rule_traversal = []
-
-    current_rules = geolocation_rules
-    # Traverse the geolocation rules using the rule_traversal values
-    for field_value in rule_traversal:
-        current_rules = current_rules.get(field_value)
-        # If we hit `None`, then we know there are no matching rules, so stop the rule traversal
-        if current_rules is None:
-            break
-
-    # We've found the tuple of the annotated geolocation
-    if isinstance(current_rules, tuple):
-        return current_rules
-
-    # We've reached the next level of geolocation rules,
-    # so try to traverse the rules with the next target in raw_geolocation
-    if isinstance(current_rules, dict):
-        next_traversal_target = raw_geolocation[len(rule_traversal)]
-        rule_traversal.append(next_traversal_target)
-        return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal)
-
-    # We did not find any matching rule for the last traversal target
-    if current_rules is None:
-        # If we've used all general rules and we still haven't found a match,
-        # then there are no applicable rules for this geolocation
-        if all(value == '*' for value in rule_traversal):
-            return None
-
-        # If we failed to find matching rule with a general rule as the last
-        # traversal target, then delete all trailing '*'s to reset rule_traversal
-        # to end with the last index that is currently NOT a '*'
-        # [A, *, B, *] => [A, *, B]
-        # [A, B, *, *] => [A, B]
-        # [A, *, *, *] => [A]
-        if rule_traversal[-1] == '*':
-            # Find the index of the first of the consecutive '*' from the
-            # end of the rule_traversal
-            # [A, *, B, *] => first_consecutive_general_rule_index = 3
-            # [A, B, *, *] => first_consecutive_general_rule_index = 2
-            # [A, *, *, *] => first_consecutive_general_rule_index = 1
-            for index, field_value in reversed(list(enumerate(rule_traversal))):
-                if field_value == '*':
-                    first_consecutive_general_rule_index = index
-                else:
-                    break
-
-            rule_traversal = rule_traversal[:first_consecutive_general_rule_index]
-
-        # Set the final value to '*' in hopes that by moving to a general rule,
-        # we can find a matching rule.
-        rule_traversal[-1] = '*'
-
-        return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal)
-
-
-def transform_geolocations(geolocation_rules, geolocation):
-    """
-    Transform the provided *geolocation* by looking it up in the provided
-    *geolocation_rules*.
-
-    This will use all rules that apply to the geolocation and rules will
-    be applied in the order of region, country, division, location.
-
-    Returns the original geolocation if no geolocation rules apply.
-
-    Raises a `CyclicGeolocationRulesError` if more than 1000 rules have
-    been applied to the raw geolocation.
-    """
-    transformed_values = geolocation
-    rules_applied = 0
-    continue_to_apply = True
-
-    while continue_to_apply:
-        annotated_values = get_annotated_geolocation(geolocation_rules, transformed_values)
-
-        # Stop applying rules if no annotated values were found
-        if annotated_values is None:
-            continue_to_apply = False
-        else:
-            rules_applied += 1
-
-            if rules_applied > 1000:
-                raise CyclicGeolocationRulesError(
-                    f"ERROR: More than 1000 geolocation rules applied on the same entry {geolocation!r}."
-                )
-
-            # Create a new list of values for comparison to previous values
-            new_values = list(transformed_values)
-            for index, value in enumerate(annotated_values):
-                # Keep original value if annotated value is '*'
-                if value != '*':
-                    new_values[index] = value
-
-            # Stop applying rules if this rule did not change the values,
-            # since this means we've reached rules with '*' that no longer change values
-            if new_values == transformed_values:
-                continue_to_apply = False
-
-            transformed_values = new_values
-
-    return transformed_values
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--region-field", default="region",
-        help="Field that contains regions in NDJSON records.")
-    parser.add_argument("--country-field", default="country",
-        help="Field that contains countries in NDJSON records.")
-    parser.add_argument("--division-field", default="division",
-        help="Field that contains divisions in NDJSON records.")
-    parser.add_argument("--location-field", default="location",
-        help="Field that contains location in NDJSON records.")
-    parser.add_argument("--geolocation-rules", metavar="TSV", required=True,
-        help="TSV file of geolocation rules with the format: " +
-            "'<raw_geolocation><tab><annotated_geolocation>' where the raw and annotated geolocations " +
-            "are formatted as '<region>/<country>/<division>/<location>'. " +
-            "If creating a general rule, then the raw field value can be substituted with '*'. " +
-            "Lines starting with '#' will be ignored as comments. " +
-            "Trailing '#' will be ignored as comments.")
-
-    args = parser.parse_args()
-
-    location_fields = [args.region_field, args.country_field, args.division_field, args.location_field]
-
-    geolocation_rules = load_geolocation_rules(args.geolocation_rules)
-
-    for record in stdin:
-        record = json.loads(record)
-
-        try:
-            annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields])
-        except CyclicGeolocationRulesError as e:
-            print(e, file=stderr)
-            exit(1)
-
-        for index, field in enumerate(location_fields):
-            record[field] = annotated_values[index]
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
-        print()
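To make the wildcard matching in the deleted `get_annotated_geolocation` concrete, here is a minimal sketch. The rule file and record are made up; the result follows from the traversal logic above, which falls back to the `'*'` location rule after the exact location misses.

```sh
# One tab-separated rule: raw<tab>annotated, each side formatted as
# region/country/division/location, with '*' as a wildcard.
printf 'North America/USA/WA/*\tNorth America/USA/Washington/*\n' > geolocation_rules.tsv

echo '{"region":"North America","country":"USA","division":"WA","location":"Seattle"}' \
  | ./apply-geolocation-rules --geolocation-rules geolocation_rules.tsv
# Only the division is rewritten; the '*' in the rule preserves "Seattle":
# {"region":"North America","country":"USA","division":"Washington","location":"Seattle"}
```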
diff --git a/ingest/vendored/merge-user-metadata b/ingest/vendored/merge-user-metadata
deleted file mode 100755
index 341c2df..0000000
--- a/ingest/vendored/merge-user-metadata
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-"""
-Merges user curated annotations with the NDJSON records from stdin, with the user
-curations overwriting the existing fields. The modified records are output
-to stdout. This does not do any additional transformations on top of the user
-curations.
-"""
-import argparse
-import csv
-import json
-from collections import defaultdict
-from sys import exit, stdin, stderr, stdout
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--annotations", metavar="TSV", required=True,
-        help="Manually curated annotations TSV file. " +
-            "The TSV should not have a header and should have exactly three columns: " +
-            "id to match existing metadata, field name, and field value. " +
-            "If there are multiple annotations for the same id and field, then the last value is used. " +
-            "Lines starting with '#' are treated as comments. " +
-            "Any '#' after the field value are treated as comments.")
-    parser.add_argument("--id-field", default="accession",
-        help="The ID field in the metadata to use to merge with the annotations.")
-
-    args = parser.parse_args()
-
-    annotations = defaultdict(dict)
-    with open(args.annotations, 'r') as annotations_fh:
-        csv_reader = csv.reader(annotations_fh, delimiter='\t')
-        for row in csv_reader:
-            if not row or row[0].lstrip()[0] == '#':
-                continue
-            elif len(row) != 3:
-                print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr)
-                continue
-            id, field, value = row
-            annotations[id][field] = value.partition('#')[0].rstrip()
-
-    for record in stdin:
-        record = json.loads(record)
-
-        record_id = record.get(args.id_field)
-        if record_id is None:
-            print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr)
-            exit(1)
-
-        record.update(annotations.get(record_id, {}))
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
-        print()
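A minimal sketch of the annotations format `merge-user-metadata` expected: three tab-separated columns (id, field, value), no header, with `#` starting a comment. The file and record contents here are hypothetical.

```sh
printf 'OQ123456\tlocation\tSalem # curator note\n' > annotations.tsv

echo '{"accession":"OQ123456","location":""}' \
  | ./merge-user-metadata --annotations annotations.tsv
# The trailing '# curator note' is stripped and the curated value wins:
# {"accession":"OQ123456","location":"Salem"}
```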
diff --git a/ingest/vendored/tests/transform-field-names/transform-field-names.t b/ingest/vendored/tests/transform-field-names/transform-field-names.t
deleted file mode 100644
index 39aa96f..0000000
--- a/ingest/vendored/tests/transform-field-names/transform-field-names.t
+++ /dev/null
@@ -1,26 +0,0 @@
-Verify behavior of `transform-field-names`
-
-If the `--field-map` includes an old field name that is the same as the new field
-name, this should be a no-op.
-
-  $ echo '{"strain": "A"}' \
-  > | $TESTDIR/../../transform-field-names \
-  >   --field-map "strain=strain"
-  {"strain":"A"}
-
-If the `--field-map` overwrites an existing field, then skip renaming and
-print a loud warning.
-
-  $ echo '{"strain": "A", "isolate": "B"}' \
-  > | $TESTDIR/../../transform-field-names \
-  >   --field-map "isolate=strain"
-  WARNING: skipping rename of isolate because record already has a field named strain.
-  {"strain":"A","isolate":"B"}
-
-The `--field-map` may overwrite an existing field if using the `--force` flag.
-
-  $ echo '{"strain": "A", "isolate": "B"}' \
-  > | $TESTDIR/../../transform-field-names \
-  >   --field-map "isolate=strain" \
-  >   --force
-  {"strain":"B"}
diff --git a/ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t b/ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t
deleted file mode 100644
index a835455..0000000
--- a/ingest/vendored/tests/transform-genbank-location/transform-genbank-location.t
+++ /dev/null
@@ -1,30 +0,0 @@
-Verify behavior of `transform-genbank-location` around presence/absence of
-`database` and `location` fields.
-
-If `location` field is present, transform it.
-
-  $ echo '{"database":"GenBank", "location": "USA:Oregon, Salem" }' \
-  > | $TESTDIR/../../transform-genbank-location
-  {"database":"GenBank","location":"Salem","country":"USA","division":"Oregon"}
-
-If `database` field is absent, complain.
-
-  $ echo '{"location": "USA:Oregon, Salem" }' \
-  > | $TESTDIR/../../transform-genbank-location
-  Record must contain `database` field to use `transform-genbank-location`.
-  {"location":"USA:Oregon, Salem"}
-
-If `database` field has unsupported value, complain.
-
-  $ echo '{"database": "unsupported", "location": "USA:Oregon, Salem" }' \
-  > | $TESTDIR/../../transform-genbank-location
-  Database value of unsupported not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq".
-  {"database":"unsupported","location":"USA:Oregon, Salem"}
-
-
-If `location` field is absent, complain.
-
-  $ echo '{"database": "GenBank" }' \
-  > | $TESTDIR/../../transform-genbank-location
-  `transform-genbank-location` requires a `location` field; this record does not have one.
-  {"database":"GenBank"}
diff --git a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t
deleted file mode 100644
index bde6e6c..0000000
--- a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t
+++ /dev/null
@@ -1,17 +0,0 @@
-Look for strain name in "strain" or a list of backup fields.
-
-If strain entry exists, do not do anything.
-
-  $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \
-  > | $TESTDIR/../../transform-strain-names \
-  >   --strain-regex '^.+$' \
-  >   --backup-fields strain_s accession
-  {"strain":"i/am/a/strain","strain_s":"other"}
-
-If strain entry does not exist, search the backup fields.
-
-  $ echo '{"strain_s": "other"}' \
-  > | $TESTDIR/../../transform-strain-names \
-  >   --strain-regex '^.+$' \
-  >   --backup-fields accession strain_s
-  {"strain_s":"other","strain":"other"}
diff --git a/ingest/vendored/transform-authors b/ingest/vendored/transform-authors
deleted file mode 100755
index 0bade20..0000000
--- a/ingest/vendored/transform-authors
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python3
-"""
-Abbreviates a full list of authors to be '<first author> et al.' of the NDJSON
-record from stdin and outputs modified records to stdout.
-
-Note: This is a "best effort" approach and can potentially mangle the author name.
-"""
-import argparse
-import json
-import re
-from sys import stderr, stdin, stdout
-
-
-def parse_authors(record: dict, authors_field: str, default_value: str,
-                  index: int, abbr_authors_field: str = None) -> dict:
-    # Strip and normalize whitespace
-    new_authors = re.sub(r'\s+', ' ', record[authors_field])
-
-    if new_authors == "":
-        new_authors = default_value
-    else:
-        # Split authors list on comma/semicolon
-        # OR "and"/"&" with at least one space before and after
-        new_authors = re.split(r'(?:\s*[,，;；]\s*|\s+(?:and|&)\s+)', new_authors)[0]
-
-        # if it does not already end with " et al.", add it
-        if not new_authors.strip('. ').endswith(" et al"):
-            new_authors += ' et al'
-
-    if abbr_authors_field:
-        if record.get(abbr_authors_field):
-            print(
-                f"WARNING: the {abbr_authors_field!r} field already exists",
-                f"in record {index} and will be overwritten!",
-                file=stderr
-            )
-
-        record[abbr_authors_field] = new_authors
-    else:
-        record[authors_field] = new_authors
-
-    return record
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--authors-field", default="authors",
-        help="The field containing list of authors.")
-    parser.add_argument("--default-value", default="?",
-        help="Default value to use if authors list is empty.")
-    parser.add_argument("--abbr-authors-field",
-        help="The field for the generated abbreviated authors. " +
-            "If not provided, the original authors field will be modified.")
-
-    args = parser.parse_args()
-
-    for index, record in enumerate(stdin):
-        record = json.loads(record)
-
-        parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field)
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
-        print()
diff --git a/ingest/vendored/transform-field-names b/ingest/vendored/transform-field-names
deleted file mode 100755
index d26e17f..0000000
--- a/ingest/vendored/transform-field-names
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python3
-"""
-Renames fields of the NDJSON record from stdin and outputs modified records
-to stdout.
-"""
-import argparse
-import json
-from sys import stderr, stdin, stdout
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--field-map", nargs="+",
-        help="Field names in the NDJSON record mapped to new field names, " +
-            "formatted as '{old_field_name}={new_field_name}'. " +
-            "If the old field does not exist in record, the new field will be added with an empty string value. " +
-            "If the new field already exists in record, then the renaming of the old field will be skipped. " +
-            "Skips the field if the old field name is the same as the new field name (case-sensitive).")
-    parser.add_argument("--force", action="store_true",
-        help="Force renaming of old field even if the new field already exists. " +
-            "Please keep in mind this will overwrite the value of the new field.")
-
-    args = parser.parse_args()
-
-    field_map = {}
-    for field in args.field_map:
-        old_name, new_name = field.split('=')
-
-        if old_name == new_name:
-            continue
-
-        field_map[old_name] = new_name
-
-    for record in stdin:
-        record = json.loads(record)
-
-        for old_field, new_field in field_map.items():
-
-            if record.get(new_field) and not args.force:
-                print(
-                    f"WARNING: skipping rename of {old_field} because record",
-                    f"already has a field named {new_field}.",
-                    file=stderr
-                )
-                continue
-
-            record[new_field] = record.pop(old_field, '')
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
-        print()
diff --git a/ingest/vendored/transform-genbank-location b/ingest/vendored/transform-genbank-location
deleted file mode 100755
index 010955a..0000000
--- a/ingest/vendored/transform-genbank-location
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-"""
-Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate
-fields: 'country', 'division', and 'location'. Checks that a record is from
-GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq".
-
-Outputs the modified record to stdout.
-"""
-import json
-from sys import stdin, stderr, stdout
-
-
-def parse_location(record: dict) -> dict:
-    # Expected pattern for the location field is "<country_value>[:<region>][, <locality>]"
-    # See GenBank docs for their "country" field:
-    # https://www.ncbi.nlm.nih.gov/genbank/collab/country/
-    location_field = record.get("location", "")
-    if not location_field:
-        print(
-            "`transform-genbank-location` requires a `location` field; this record does not have one.",
-            file=stderr,
-        )
-        # bail early because we're not gonna make any changes
-        return record
-
-    geographic_data = location_field.split(':')
-
-    country = geographic_data[0]
-    division = ''
-    location = ''
-
-    if len(geographic_data) == 2:
-        division , _ , location = geographic_data[1].partition(',')
-
-    record['country'] = country.strip()
-    record['division'] = division.strip()
-    record['location'] = location.strip()
-
-    return record
-
-
-if __name__ == '__main__':
-
-    for record in stdin:
-        record = json.loads(record)
-
-        database = record.get('database', '')
-        if database in {'GenBank', 'RefSeq'}:
-            parse_location(record)
-        else:
-            if database:
-                error_msg = f"""Database value of {database} not supported for `transform-genbank-location`; must be "GenBank" or "RefSeq"."""
-            else:
-                error_msg = "Record must contain `database` field to use `transform-genbank-location`."
-
-            print(error_msg, file=stderr)
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
-        print()
diff --git a/ingest/vendored/transform-strain-names b/ingest/vendored/transform-strain-names
deleted file mode 100755
index d86c0e4..0000000
--- a/ingest/vendored/transform-strain-names
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-"""
-Verifies strain name pattern in the 'strain' field of the NDJSON record from
-stdin. Adds a 'strain' field to the record if it does not already exist.
-
-Outputs the modified records to stdout.
-"""
-import argparse
-import json
-import re
-from sys import stderr, stdin, stdout
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--strain-regex", default="^.+$",
-        help="Regex pattern for strain names. " +
-            "Strain names that do not match the pattern will be dropped.")
-    parser.add_argument("--backup-fields", nargs="*",
-        help="List of backup fields to use as strain name if the value in 'strain' " +
-            "does not match the strain regex pattern. " +
-            "If multiple fields are provided, will use the first field that has a non-empty string.")
-
-    args = parser.parse_args()
-
-    strain_name_pattern = re.compile(args.strain_regex)
-
-    for index, record in enumerate(stdin):
-        record = json.loads(record)
-
-        # Verify strain name matches the strain regex pattern
-        if strain_name_pattern.match(record.get('strain', '')) is None:
-            # Default to empty string if not matching pattern
-            record['strain'] = ''
-            # Use non-empty value of backup fields if provided
-            if args.backup_fields:
-                for field in args.backup_fields:
-                    if record.get(field):
-                        record['strain'] = str(record[field])
-                        break
-
-        if record['strain'] == '':
-            print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr)
-
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=(',', ':'))
-        print()