From eb004d8d41a007be170bfcfdf8aabee779fdf279 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 17 May 2023 15:39:20 +0100 Subject: [PATCH 1/6] Add fieldOption, skipFieldPattern options --- adtl/__init__.py | 15 +++++++++++++++ schemas/dev.schema.json | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/adtl/__init__.py b/adtl/__init__.py index c59ace1..a18219d 100644 --- a/adtl/__init__.py +++ b/adtl/__init__.py @@ -63,7 +63,19 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: rule, list ): # not a container, is constant return rule + # Check whether field is present if it's allowed to be passed over + if "fieldOption" in rule: + try: + row[rule["fieldOption"]] + row["field"] = row.pop("fieldOption") + except KeyError: + return None if "field" in rule: + if ctx and ctx.get("skip_pattern").match(rule["field"]): + try: + row[rule["field"]] + except KeyError: + return None # do not parse field if condition is not met if "if" in rule and not parse_if(row, rule["if"]): return None @@ -485,6 +497,9 @@ def ctx(self, attribute: str): "defaultDateFormat": self.header.get( "defaultDateFormat", DEFAULT_DATE_FORMAT ), + "skip_pattern": re.compile(self.header.get("skipFieldPattern")) + if self.header.get("skipFieldPattern") + else False, } def validate_spec(self): diff --git a/schemas/dev.schema.json b/schemas/dev.schema.json index ccbc6a0..4f424e7 100644 --- a/schemas/dev.schema.json +++ b/schemas/dev.schema.json @@ -27,6 +27,10 @@ "type": "string", "description": "This is only used with combinedType, specifies a regular expression matching multiple fields" }, + "fieldOption": { + "type": "string", + "description": "Corresponding field name in source file, can be skipped if not present in data" + }, "sensitive": { "type": "boolean", "description": "Indicates to the parser whether the field is sensitive. 
Usually a sensitive field is hashed or encrypted before storing in the database.", From 5d6d9f079b6c972a634c08c19d31cc9c50559f7e Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 18 May 2023 17:05:37 +0100 Subject: [PATCH 2/6] Works and passes tests (more needed) but slooow --- adtl/__init__.py | 59 ++++++++++++++++++---------- schemas/dev.schema.json | 9 +++-- tests/__snapshots__/test_parser.ambr | 16 ++++++++ tests/parsers/skip_field.json | 30 ++++++++++++++ tests/schemas/epoch-data.schema.json | 8 +++- tests/sources/skip_field_absent.csv | 3 ++ tests/sources/skip_field_present.csv | 3 ++ tests/test_parser.py | 39 +++++++++++++++++- 8 files changed, 141 insertions(+), 26 deletions(-) create mode 100644 tests/parsers/skip_field.json create mode 100644 tests/sources/skip_field_absent.csv create mode 100644 tests/sources/skip_field_present.csv diff --git a/adtl/__init__.py b/adtl/__init__.py index a18219d..f0a2c54 100644 --- a/adtl/__init__.py +++ b/adtl/__init__.py @@ -11,7 +11,7 @@ from datetime import datetime from pathlib import Path from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union, Callable import pint import tomli @@ -64,18 +64,10 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any: ): # not a container, is constant return rule # Check whether field is present if it's allowed to be passed over - if "fieldOption" in rule: - try: - row[rule["fieldOption"]] - row["field"] = row.pop("fieldOption") - except KeyError: - return None if "field" in rule: - if ctx and ctx.get("skip_pattern").match(rule["field"]): - try: - row[rule["field"]] - except KeyError: - return None + # do not check for condition if field is missing + if skip_field(row, rule, ctx): + return None # do not parse field if condition is not met if "if" in rule and not parse_if(row, rule["if"]): return None @@ -153,19 +145,33 @@ def matching_fields(fields: 
List[str], pattern: str) -> List[str]: return [f for f in fields if compiled_pattern.match(f)] -def parse_if(row: StrDict, rule: StrDict) -> bool: +def parse_if(row: StrDict, rule: StrDict, ctx: Callable = None, can_skip=False) -> bool: "Parse conditional statements and return a boolean" n_keys = len(rule.keys()) - assert n_keys == 1 + # assert n_keys == 1 + assert n_keys == 1 or n_keys == 2 + if n_keys == 2: + assert "can_skip" in rule + can_skip = True key = next(iter(rule.keys())) if key == "not" and isinstance(rule[key], dict): - return not parse_if(row, rule[key]) + return not parse_if(row, rule[key], ctx, can_skip) elif key == "any" and isinstance(rule[key], list): - return any(parse_if(row, r) for r in rule[key]) + return any(parse_if(row, r, ctx, can_skip) for r in rule[key]) elif key == "all" and isinstance(rule[key], list): - return all(parse_if(row, r) for r in rule[key]) - attr_value = row[key] + return all(parse_if(row, r, ctx, can_skip) for r in rule[key]) + try: + attr_value = row[key] + except KeyError as e: + if can_skip == True: + return False + elif ctx: + if skip_field(row, {"field": key}, ctx(key)): + return False + else: + raise e + if isinstance(rule[key], dict): cmp = next(iter(rule[key])) value = rule[key][cmp] @@ -562,8 +568,16 @@ def default_if(self, table: str, rule: StrDict): if "combinedType" not in rule[option]: field = rule[option]["field"] - if "values" in rule[option]: + if "values" in rule[option] and "can_skip" in rule[option]: + if_rule = { + "any": [ + {field: v, "can_skip": True} for v in rule[option]["values"] + ] + } + elif "values" in rule[option]: if_rule = {"any": [{field: v} for v in rule[option]["values"]]} + elif "can_skip" in rule[option]: + if_rule = {field: {"!=": ""}, "can_skip": True} else: if_rule = {field: {"!=": ""}} else: @@ -582,6 +596,11 @@ def default_if(self, table: str, rule: StrDict): ) if_rule = {"any": sum(map(condition, rules), [])} + for ir in if_rule["any"]: + for r in rules: + if 
str(list(ir.keys())[0]) in r.values() and "can_skip" in r.keys(): + ir["can_skip"] = True + rule["if"] = if_rule return rule @@ -603,7 +622,7 @@ def update_table(self, table: str, row: StrDict): for match in self.spec[table]: if "if" not in match: match = self.default_if(table, match) - if parse_if(row, match["if"]): + if parse_if(row, match["if"], self.ctx): self.data[table].append( remove_null_keys( { diff --git a/schemas/dev.schema.json b/schemas/dev.schema.json index 4f424e7..371a519 100644 --- a/schemas/dev.schema.json +++ b/schemas/dev.schema.json @@ -27,10 +27,6 @@ "type": "string", "description": "This is only used with combinedType, specifies a regular expression matching multiple fields" }, - "fieldOption": { - "type": "string", - "description": "Corresponding field name in source file, can be skipped if not present in data" - }, "sensitive": { "type": "boolean", "description": "Indicates to the parser whether the field is sensitive. Usually a sensitive field is hashed or encrypted before storing in the database.", @@ -88,6 +84,11 @@ ] } } + }, + "can_skip": { + "type": "boolean", + "description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data.", + "default": false } } } diff --git a/tests/__snapshots__/test_parser.ambr b/tests/__snapshots__/test_parser.ambr index 69bcf4b..033157e 100644 --- a/tests/__snapshots__/test_parser.ambr +++ b/tests/__snapshots__/test_parser.ambr @@ -23,6 +23,22 @@ ''' # --- +# name: test_skip_field_pattern_absent + ''' + adtl_valid,adtl_error,cough,epoch,followup_cough,id,text + False,data.epoch must be date,1,11/01/1999,,1,Lorem ipsum + False,data.epoch must be date,0,19/12/2022,,2,example + + ''' +# --- +# name: test_skip_field_pattern_present + ''' + adtl_valid,adtl_error,cough,epoch,followup_cough,id,text + False,data.epoch must be date,1,11/01/1999,0,1,Lorem ipsum + False,data.epoch must be date,0,19/12/2022,1,2,example + + ''' +# --- # name: test_validation ''' 
adtl_valid,adtl_error,admission_date,country_iso3,dataset_id,enrolment_date,ethnicity,sex_at_birth,subject_id diff --git a/tests/parsers/skip_field.json b/tests/parsers/skip_field.json new file mode 100644 index 0000000..725c8fe --- /dev/null +++ b/tests/parsers/skip_field.json @@ -0,0 +1,30 @@ +{ + "adtl": { + "name": "allow-skip-field-pattern", + "description": "Tests skipping missing fields", + "skipFieldPattern": "flw.*", + "tables": { + "table": { + "kind": "oneToOne", + "schema": "../schemas/epoch-data.schema.json" + } + } + }, + "table": { + "id": { + "field": "Entry_ID" + }, + "epoch": { + "field": "Epoch" + }, + "text": { + "field": "Text" + }, + "cough": { + "field": "cough" + }, + "followup_cough": { + "field": "flw_cough" + } + } +} \ No newline at end of file diff --git a/tests/schemas/epoch-data.schema.json b/tests/schemas/epoch-data.schema.json index 5ba856e..abdc5f7 100644 --- a/tests/schemas/epoch-data.schema.json +++ b/tests/schemas/epoch-data.schema.json @@ -21,6 +21,12 @@ }, "text": { "description": "Text field" + }, + "cough": { + "description": "Standard cough field" + }, + "followup_cough": { + "description": "Follow-up cough field" } } -} +} \ No newline at end of file diff --git a/tests/sources/skip_field_absent.csv b/tests/sources/skip_field_absent.csv new file mode 100644 index 0000000..9962f55 --- /dev/null +++ b/tests/sources/skip_field_absent.csv @@ -0,0 +1,3 @@ +Entry_ID,Epoch,Text,cough +1,11/01/1999,Lorem ipsum,1 +2,19/12/2022,example,0 diff --git a/tests/sources/skip_field_present.csv b/tests/sources/skip_field_present.csv new file mode 100644 index 0000000..515c984 --- /dev/null +++ b/tests/sources/skip_field_present.csv @@ -0,0 +1,3 @@ +Entry_ID,Epoch,Text,cough,flw_cough +1,11/01/1999,Lorem ipsum,1,0 +2,19/12/2022,example,0,1 diff --git a/tests/test_parser.py b/tests/test_parser.py index 1ff7621..a8a358c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -279,6 +279,23 @@ }, ] +RULE_FIELD_OPTION = { + "field": 
"aidshiv_mhyn", + "values": {"1": True, "0": False}, + "can_skip": True, +} + +# OBSERVATION_RULE_FIELD_OPTION = { +# "name": "bleeding", +# "phase": "admission", +# "date": "2023-05-18", +# "is_present": { +# "field": "bleed_ceterm_v2", +# "values": {"1": True, "0": False}, +# "can_skip": True, +# }, +# } + @pytest.mark.parametrize( "row_rule,expected", @@ -350,6 +367,9 @@ unordered(["Lopinavir/Ritonvir", "Interferon alpha"]), ), (({"first": "", "second": ""}, RULE_COMBINED_FIRST_NON_NULL), None), + (({"aidshiv": "1"}, RULE_FIELD_OPTION), None), + (({"aidshiv_mhyn": "1"}, RULE_FIELD_OPTION), True), + (({"aidshiv_mhyn": "2"}, RULE_FIELD_OPTION), None), ], ) def test_get_value(row_rule, expected): @@ -405,7 +425,6 @@ def test_one_to_many(): assert actual_one_many_output_csv == ONE_MANY_OUTPUT -# HERE def test_one_to_many_correct_if_behaviour(): actual_row = list( parser.Parser(TEST_PARSERS_PATH / "oneToMany-missingIf.toml") @@ -781,3 +800,21 @@ def test_apply_in_observations_table(): ) assert apply_observations_output == APPLY_OBSERVATIONS_OUTPUT + + +def test_skip_field_pattern_present(snapshot): + transformed_csv_data = ( + parser.Parser(TEST_PARSERS_PATH / "skip_field.json") + .parse(TEST_SOURCES_PATH / "skip_field_present.csv") + .write_csv("table") + ) + assert transformed_csv_data == snapshot + + +def test_skip_field_pattern_absent(snapshot): + transformed_csv_data = ( + parser.Parser(TEST_PARSERS_PATH / "skip_field.json") + .parse(TEST_SOURCES_PATH / "skip_field_absent.csv") + .write_csv("table") + ) + assert transformed_csv_data == snapshot From 0c0b1426592b011feaea715b2dca71adaa63c733 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 19 May 2023 13:51:40 +0100 Subject: [PATCH 3/6] tidied up --- adtl/__init__.py | 62 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/adtl/__init__.py b/adtl/__init__.py index f0a2c54..5807e6d 100644 --- a/adtl/__init__.py +++ b/adtl/__init__.py @@ -419,6 +419,24 
@@ def read_definition(file: Path) -> Dict[str, Any]: raise ValueError(f"Unsupported file format: {file}") +def skip_field(row, rule, ctx: Context = None): + "Returns True if the field is missing and allowed to be skipped" + # made no difference + if "can_skip" in rule: + if rule["can_skip"]: + if rule["field"] not in row: + return True + else: + return False + if ctx and ctx.get("skip_pattern"): + if ctx.get("skip_pattern").match(rule["field"]): + if rule["field"] not in row: + return True + else: + return False + return False + + class Parser: def __init__(self, spec: Union[str, Path, StrDict], include_defs: List[str] = []): "Loads specification from spec in format (default json)" @@ -568,14 +586,12 @@ def default_if(self, table: str, rule: StrDict): if "combinedType" not in rule[option]: field = rule[option]["field"] - if "values" in rule[option] and "can_skip" in rule[option]: - if_rule = { - "any": [ - {field: v, "can_skip": True} for v in rule[option]["values"] - ] - } - elif "values" in rule[option]: - if_rule = {"any": [{field: v} for v in rule[option]["values"]]} + if "values" in rule[option]: + values = rule[option]["values"] + if "can_skip" in rule[option]: + if_rule = {"any": [{field: v, "can_skip": True} for v in values]} + else: + if_rule = {"any": [{field: v} for v in values]} elif "can_skip" in rule[option]: if_rule = {field: {"!=": ""}, "can_skip": True} else: @@ -589,17 +605,27 @@ def default_if(self, table: str, rule: StrDict): "list", ], f"Invalid combinedType: {rule[option]['combinedType']}" rules = rule[option]["fields"] - condition = ( - lambda rule: [{rule["field"]: v} for v in rule["values"]] - if "values" in rule - else [{rule["field"]: {"!=": ""}}] - ) - if_rule = {"any": sum(map(condition, rules), [])} - for ir in if_rule["any"]: - for r in rules: - if str(list(ir.keys())[0]) in r.values() and "can_skip" in r.keys(): - ir["can_skip"] = True + def create_if_rule(rule): # better, but not faster + field = rule["field"] + values = 
rule.get("values", []) + can_skip = rule.get("can_skip", False) + + if_condition = {} + + if values and can_skip: + if_condition = [{field: v, "can_skip": True} for v in values] + elif values: + if_condition = [{field: v} for v in values] + elif can_skip: + if_condition[field] = {"!=": ""} + if_condition["can_skip"] = True + else: + if_condition[field] = {"!=": ""} + + return if_condition + + if_rule = {"any": sum(map(create_if_rule, rules), [])} rule["if"] = if_rule return rule From c69b7b2fccd742a0410695a6d1cbe687988baa79 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 19 May 2023 15:04:45 +0100 Subject: [PATCH 4/6] simplify skip_field --- adtl/__init__.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/adtl/__init__.py b/adtl/__init__.py index 5807e6d..ac9566e 100644 --- a/adtl/__init__.py +++ b/adtl/__init__.py @@ -422,18 +422,10 @@ def read_definition(file: Path) -> Dict[str, Any]: def skip_field(row, rule, ctx: Context = None): "Returns True if the field is missing and allowed to be skipped" # made no difference - if "can_skip" in rule: - if rule["can_skip"]: - if rule["field"] not in row: - return True - else: - return False - if ctx and ctx.get("skip_pattern"): - if ctx.get("skip_pattern").match(rule["field"]): - if rule["field"] not in row: - return True - else: - return False + if rule.get("can_skip"): + return rule["field"] not in row + if ctx and ctx.get("skip_pattern") and ctx.get("skip_pattern").match(rule["field"]): + return rule["field"] not in row return False From 835c69dd04dfcaa303dc6142e0ba109e45e6b073 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 19 May 2023 15:59:34 +0100 Subject: [PATCH 5/6] Add documentation, edit schema --- adtl/__init__.py | 7 ++++--- docs/specification.md | 44 +++++++++++++++++++++++++++++++++++++++++ schemas/dev.schema.json | 5 ++--- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/adtl/__init__.py b/adtl/__init__.py index ac9566e..3973242 100644 --- 
a/adtl/__init__.py +++ b/adtl/__init__.py @@ -145,7 +145,9 @@ def matching_fields(fields: List[str], pattern: str) -> List[str]: return [f for f in fields if compiled_pattern.match(f)] -def parse_if(row: StrDict, rule: StrDict, ctx: Callable = None, can_skip=False) -> bool: +def parse_if( + row: StrDict, rule: StrDict, ctx: Callable[[str], dict] = None, can_skip=False +) -> bool: "Parse conditional statements and return a boolean" n_keys = len(rule.keys()) @@ -419,9 +421,8 @@ def read_definition(file: Path) -> Dict[str, Any]: raise ValueError(f"Unsupported file format: {file}") -def skip_field(row, rule, ctx: Context = None): +def skip_field(row: StrDict, rule: StrDict, ctx: Context = None): "Returns True if the field is missing and allowed to be skipped" - # made no difference if rule.get("can_skip"): return rule["field"] not in row if ctx and ctx.get("skip_pattern") and ctx.get("skip_pattern").match(rule["field"]): diff --git a/docs/specification.md b/docs/specification.md index 4256b38..26e6656 100644 --- a/docs/specification.md +++ b/docs/specification.md @@ -45,6 +45,8 @@ These metadata fields are defined under a header key `adtl`. * **defs**: Definitions that can be referred to elsewhere in the schema * **include-def** (list): List of additional TOML or JSON files to import as definitions +* **skipFieldPattern** : Regex string matching field names which may be skipped +if not present in a datafile, following the same syntax as `fieldPattern` key. * **defaultDateFormat**: Default source date format, applied to all fields with either "date_" / "_date" in the field name or that have format date set in the JSON schema @@ -297,6 +299,48 @@ fields = [ If *excludeWhen* is not set, no exclusions take place and all values are returned as-is. +### Skippable fields + +In some cases, a study will be associated with multiple data files, all of which have been +filled in to varying degrees. For example, one study site may not provide any follow-up data. 
+ +Rather than writing a new parser for every data file with minor differences, parsers can be made +robust to a certain amount of missing data by tagging applicable fields with `can_skip = true`, +for example: + +```ini +[[observation]] + name = "cough" + phase = "admission" + date = { field = "admit_date" } + is_present = { field = "cough_ceoccur_v2", description = "Cough", ref = "Y/N/NK", "can_skip" = true } +``` + +In this case, if adtl does not find `cough_ceoccur_v2` in the data it will skip over the field +and continue, rather than throwing an error. + +If there are lots of fields missing all with similar field names, for example if followup data +has been omitted and all the followup fields are labelled with a `flw` prefix e.g., `flw_cough`, +`flw2_fatigue`, this can be specified at the top of the file: + +```ini +[adtl] + name = "isaric-core" + description = "isaric-core" + skipFieldPattern = "flw.*" + +[table.sex_at_birth] + combinedType = "firstNonNull" + excludeWhen = "none" + fields = [ + { field = "sex", values = { 1 = "male", 2 = "female" } }, + { field = "flw_sex_at_birth", values = { 1 = "male", 2 = "female", 3 = "non_binary" } }, + { field = "flw2_sex_at_birth", values = { 1 = "male", 2 = "female", 3 = "non_binary" } }, + ] +``` + +Notice that in this case `can_skip` does not need to be added to the fields with a `flw` prefix. + ### Data transformations (apply) Arbitrary functions can be applied to source fields. adtl ships with a library diff --git a/schemas/dev.schema.json b/schemas/dev.schema.json index 371a519..c794ca8 100644 --- a/schemas/dev.schema.json +++ b/schemas/dev.schema.json @@ -86,9 +86,8 @@ } }, "can_skip": { - "type": "boolean", - "description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data.", - "default": false + "const": true, + "description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data." 
} } } From c36d03a756f7fa8523937795404008e382800bcd Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 22 May 2023 12:49:50 +0100 Subject: [PATCH 6/6] Add tests --- adtl/__init__.py | 5 +- tests/__snapshots__/test_parser.ambr | 12 +- tests/parsers/oneToMany-missingIf.toml | 13 +++ tests/parsers/skip_field.json | 4 + .../schemas/observation_defaultif.schema.json | 4 +- tests/sources/oneToManyIf.csv | 4 +- tests/sources/skip_field_present.csv | 6 +- tests/test_parser.py | 107 +++++++++++++++--- 8 files changed, 126 insertions(+), 29 deletions(-) diff --git a/adtl/__init__.py b/adtl/__init__.py index 3973242..c1609b4 100644 --- a/adtl/__init__.py +++ b/adtl/__init__.py @@ -151,7 +151,6 @@ def parse_if( "Parse conditional statements and return a boolean" n_keys = len(rule.keys()) - # assert n_keys == 1 assert n_keys == 1 or n_keys == 2 if n_keys == 2: assert "can_skip" in rule @@ -166,7 +165,7 @@ def parse_if( try: attr_value = row[key] except KeyError as e: - if can_skip == True: + if can_skip is True: return False elif ctx: if skip_field(row, {"field": key}, ctx(key)): @@ -599,7 +598,7 @@ def default_if(self, table: str, rule: StrDict): ], f"Invalid combinedType: {rule[option]['combinedType']}" rules = rule[option]["fields"] - def create_if_rule(rule): # better, but not faster + def create_if_rule(rule): field = rule["field"] values = rule.get("values", []) can_skip = rule.get("can_skip", False) diff --git a/tests/__snapshots__/test_parser.ambr b/tests/__snapshots__/test_parser.ambr index 033157e..0ca1170 100644 --- a/tests/__snapshots__/test_parser.ambr +++ b/tests/__snapshots__/test_parser.ambr @@ -25,17 +25,17 @@ # --- # name: test_skip_field_pattern_absent ''' - adtl_valid,adtl_error,cough,epoch,followup_cough,id,text - False,data.epoch must be date,1,11/01/1999,,1,Lorem ipsum - False,data.epoch must be date,0,19/12/2022,,2,example + adtl_valid,adtl_error,cough,epoch,followup_cough,headache,id,text + False,data.epoch must be date,1,11/01/1999,,,1,Lorem ipsum + 
False,data.epoch must be date,0,19/12/2022,,,2,example ''' # --- # name: test_skip_field_pattern_present ''' - adtl_valid,adtl_error,cough,epoch,followup_cough,id,text - False,data.epoch must be date,1,11/01/1999,0,1,Lorem ipsum - False,data.epoch must be date,0,19/12/2022,1,2,example + adtl_valid,adtl_error,cough,epoch,followup_cough,headache,id,text + False,data.epoch must be date,1,11/01/1999,0,3,1,Lorem ipsum + False,data.epoch must be date,0,19/12/2022,1,0,2,example ''' # --- diff --git a/tests/parsers/oneToMany-missingIf.toml b/tests/parsers/oneToMany-missingIf.toml index 75fa1b7..8c4bb09 100644 --- a/tests/parsers/oneToMany-missingIf.toml +++ b/tests/parsers/oneToMany-missingIf.toml @@ -1,6 +1,7 @@ [adtl] name = "sampleOneToMany - missingIf" description = "One to Many example where if statements are removed" + skipFieldPattern = "flw3.*" [adtl.tables.observation] kind = "oneToMany" @@ -47,3 +48,15 @@ is_present = { field = "flw2_fever_{n}", values = { 0 = false, 1 = true } } # if.any = [ { "flw2_fever_{n}" = 1 }, { "flw2_fever_{n}" = 0 } ] for.n.range = [1, 2] + +[[observation]] + name = "fatigue_malaise" + phase = "followup" + date = { field = "dt" } + is_present = { field = "flw3_fatigue", description = "Fatigue", values = { 1 = true, 0 = false } } + +[[observation]] + name = "severe_dehydration" + phase = "admission" + date = { field = "dt" } + is_present = { field = "dehydration_vsorres", description = "Severe dehydration:", ref = "Y/N/NK", "can_skip" = true } diff --git a/tests/parsers/skip_field.json b/tests/parsers/skip_field.json index 725c8fe..97079cc 100644 --- a/tests/parsers/skip_field.json +++ b/tests/parsers/skip_field.json @@ -25,6 +25,10 @@ }, "followup_cough": { "field": "flw_cough" + }, + "headache": { + "field": "headache", + "can_skip": true } } } \ No newline at end of file diff --git a/tests/schemas/observation_defaultif.schema.json b/tests/schemas/observation_defaultif.schema.json index 073b584..525f42c 100644 --- 
a/tests/schemas/observation_defaultif.schema.json +++ b/tests/schemas/observation_defaultif.schema.json @@ -58,7 +58,9 @@ "headache", "oxygen_saturation", "pao2_sample_type", - "history_of_fever" + "history_of_fever", + "fatigue_malaise", + "severe_dehydration" ], "description": "Observation name" } diff --git a/tests/sources/oneToManyIf.csv b/tests/sources/oneToManyIf.csv index 58a58df..fa5bd1e 100644 --- a/tests/sources/oneToManyIf.csv +++ b/tests/sources/oneToManyIf.csv @@ -1,2 +1,2 @@ -dt,dt_1,dt_2,headache_v2,oxy_vsorres,cough_ceoccur_v2,dry_cough_ceoccur_v2,wet_cough_ceoccur_v2,pao2_lbspec,flw2_fever_1,flw2_fever_2 -2022-02-05,2022-02-06,2022-02-07,2,87,3,1,2,3,1,0 +dt,dt_1,dt_2,headache_v2,oxy_vsorres,cough_ceoccur_v2,dry_cough_ceoccur_v2,wet_cough_ceoccur_v2,pao2_lbspec,flw2_fever_1,flw2_fever_2,flw3_fatigue,dehydration_vsorres +2022-02-05,2022-02-06,2022-02-07,2,87,3,1,2,3,1,0,1,2 diff --git a/tests/sources/skip_field_present.csv b/tests/sources/skip_field_present.csv index 515c984..3deab0a 100644 --- a/tests/sources/skip_field_present.csv +++ b/tests/sources/skip_field_present.csv @@ -1,3 +1,3 @@ -Entry_ID,Epoch,Text,cough,flw_cough -1,11/01/1999,Lorem ipsum,1,0 -2,19/12/2022,example,0,1 +Entry_ID,Epoch,Text,cough,flw_cough,headache +1,11/01/1999,Lorem ipsum,1,0,3 +2,19/12/2022,example,0,1,0 diff --git a/tests/test_parser.py b/tests/test_parser.py index a8a358c..0ba633a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -148,6 +148,20 @@ "is_present": False, "adtl_valid": True, }, + { + "date": "2022-02-05", + "name": "fatigue_malaise", + "phase": "followup", + "is_present": True, + "adtl_valid": True, + }, + { + "date": "2022-02-05", + "name": "severe_dehydration", + "phase": "admission", + "is_present": False, + "adtl_valid": True, + }, ] ONE_MANY_IF_MISSINGDATA_OUTPUT = [ @@ -279,22 +293,54 @@ }, ] -RULE_FIELD_OPTION = { +RULE_FIELD_OPTION_SKIP = { "field": "aidshiv_mhyn", "values": {"1": True, "0": False}, "can_skip": True, } -# 
OBSERVATION_RULE_FIELD_OPTION = { -# "name": "bleeding", -# "phase": "admission", -# "date": "2023-05-18", -# "is_present": { -# "field": "bleed_ceterm_v2", -# "values": {"1": True, "0": False}, -# "can_skip": True, -# }, -# } +OBSERVATION_RULE_FIELD_OPTION_SKIP = { + "name": "bleeding", + "phase": "admission", + "date": "2023-05-18", + "is_present": { + "field": "bleed_ceterm_v2", + "values": {"1": True, "0": False}, + "can_skip": True, + }, +} +OBSERVATION_RULE_FIELD_OPTION_VALUE = { + "name": "temperature_celsius", + "phase": "admission", + "date": "2023-05-22", + "value": { + "field": "temp_vsorres", + "source_unit": {"field": "temp_vsorresu", "values": {"1": "°C", "2": "°F"}}, + }, +} + +OBSERVATION_RULE_FIELD_OPTION_COMB = { + "name": "cough", + "phase": "admission", + "date": "2023-05-22", + "is_present": { + "combinedType": "any", + "excludeWhen": "none", + "fields": [ + {"field": "cough_ceoccur_v2", "values": {"1": "true", "0": "false"}}, + { + "field": "coughsput_ceoccur_v2", + "values": {"1": "true", "0": "false"}, + "can_skip": "true", + }, + { + "field": "coughhb_ceoccur_v2", + "values": {"1": "true", "0": "false"}, + "can_skip": "true", + }, + ], + }, +} @pytest.mark.parametrize( @@ -367,9 +413,9 @@ unordered(["Lopinavir/Ritonvir", "Interferon alpha"]), ), (({"first": "", "second": ""}, RULE_COMBINED_FIRST_NON_NULL), None), - (({"aidshiv": "1"}, RULE_FIELD_OPTION), None), - (({"aidshiv_mhyn": "1"}, RULE_FIELD_OPTION), True), - (({"aidshiv_mhyn": "2"}, RULE_FIELD_OPTION), None), + (({"aidshiv": "1"}, RULE_FIELD_OPTION_SKIP), None), + (({"aidshiv_mhyn": "1"}, RULE_FIELD_OPTION_SKIP), True), + (({"aidshiv_mhyn": "2"}, RULE_FIELD_OPTION_SKIP), None), ], ) def test_get_value(row_rule, expected): @@ -425,6 +471,39 @@ def test_one_to_many(): assert actual_one_many_output_csv == ONE_MANY_OUTPUT +@pytest.mark.parametrize( + "rule,expected", + [ + ( + OBSERVATION_RULE_FIELD_OPTION_SKIP, + { + "any": [ + {"bleed_ceterm_v2": "1", "can_skip": True}, + 
{"bleed_ceterm_v2": "0", "can_skip": True}, + ] + }, + ), + (OBSERVATION_RULE_FIELD_OPTION_VALUE, {"temp_vsorres": {"!=": ""}}), + ( + OBSERVATION_RULE_FIELD_OPTION_COMB, + { + "any": [ + {"cough_ceoccur_v2": "1"}, + {"cough_ceoccur_v2": "0"}, + {"coughsput_ceoccur_v2": "1", "can_skip": True}, + {"coughsput_ceoccur_v2": "0", "can_skip": True}, + {"coughhb_ceoccur_v2": "1", "can_skip": True}, + {"coughhb_ceoccur_v2": "0", "can_skip": True}, + ] + }, + ), + ], +) +def test_default_if_rule_is_correct(rule, expected): + psr = parser.Parser(TEST_PARSERS_PATH / "oneToMany-missingIf.toml") + assert psr.default_if("observation", rule)["if"] == expected + + def test_one_to_many_correct_if_behaviour(): actual_row = list( parser.Parser(TEST_PARSERS_PATH / "oneToMany-missingIf.toml")