Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip fields #67

Merged
merged 6 commits into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 67 additions & 15 deletions adtl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datetime import datetime
from pathlib import Path
from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional, Union
from typing import Any, Dict, Iterable, List, Optional, Union, Callable

import pint
import tomli
Expand Down Expand Up @@ -63,7 +63,11 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
rule, list
): # not a container, is constant
return rule
# Check whether field is present if it's allowed to be passed over
if "field" in rule:
# do not check for condition if field is missing
if skip_field(row, rule, ctx):
return None
# do not parse field if condition is not met
if "if" in rule and not parse_if(row, rule["if"]):
return None
Expand Down Expand Up @@ -141,19 +145,34 @@ def matching_fields(fields: List[str], pattern: str) -> List[str]:
return [f for f in fields if compiled_pattern.match(f)]


def parse_if(row: StrDict, rule: StrDict) -> bool:
def parse_if(
row: StrDict, rule: StrDict, ctx: Callable[[str], dict] = None, can_skip=False
) -> bool:
"Parse conditional statements and return a boolean"

n_keys = len(rule.keys())
assert n_keys == 1
assert n_keys == 1 or n_keys == 2
if n_keys == 2:
assert "can_skip" in rule
can_skip = True
key = next(iter(rule.keys()))
if key == "not" and isinstance(rule[key], dict):
return not parse_if(row, rule[key])
return not parse_if(row, rule[key], ctx, can_skip)
elif key == "any" and isinstance(rule[key], list):
return any(parse_if(row, r) for r in rule[key])
return any(parse_if(row, r, ctx, can_skip) for r in rule[key])
elif key == "all" and isinstance(rule[key], list):
return all(parse_if(row, r) for r in rule[key])
attr_value = row[key]
return all(parse_if(row, r, ctx, can_skip) for r in rule[key])
try:
attr_value = row[key]
except KeyError as e:
if can_skip is True:
return False
elif ctx:
if skip_field(row, {"field": key}, ctx(key)):
return False
else:
raise e

if isinstance(rule[key], dict):
cmp = next(iter(rule[key]))
value = rule[key][cmp]
Expand Down Expand Up @@ -401,6 +420,15 @@ def read_definition(file: Path) -> Dict[str, Any]:
raise ValueError(f"Unsupported file format: {file}")


def skip_field(row: StrDict, rule: StrDict, ctx: Context = None):
    """Return True when the rule's field is absent from the row and may be skipped.

    A field is skippable either when the rule itself carries ``can_skip``,
    or when the parser context provides a compiled ``skip_pattern`` regex
    that matches the field name.
    """
    if rule.get("can_skip"):
        return rule["field"] not in row
    if ctx:
        pattern = ctx.get("skip_pattern")
        if pattern and pattern.match(rule["field"]):
            return rule["field"] not in row
    return False


class Parser:
def __init__(self, spec: Union[str, Path, StrDict], include_defs: List[str] = []):
"Loads specification from spec in format (default json)"
Expand Down Expand Up @@ -485,6 +513,9 @@ def ctx(self, attribute: str):
"defaultDateFormat": self.header.get(
"defaultDateFormat", DEFAULT_DATE_FORMAT
),
"skip_pattern": re.compile(self.header.get("skipFieldPattern"))
if self.header.get("skipFieldPattern")
else False,
}

def validate_spec(self):
Expand Down Expand Up @@ -548,7 +579,13 @@ def default_if(self, table: str, rule: StrDict):
if "combinedType" not in rule[option]:
field = rule[option]["field"]
if "values" in rule[option]:
if_rule = {"any": [{field: v} for v in rule[option]["values"]]}
values = rule[option]["values"]
if "can_skip" in rule[option]:
if_rule = {"any": [{field: v, "can_skip": True} for v in values]}
else:
if_rule = {"any": [{field: v} for v in values]}
elif "can_skip" in rule[option]:
if_rule = {field: {"!=": ""}, "can_skip": True}
else:
if_rule = {field: {"!=": ""}}
else:
Expand All @@ -560,12 +597,27 @@ def default_if(self, table: str, rule: StrDict):
"list",
], f"Invalid combinedType: {rule[option]['combinedType']}"
rules = rule[option]["fields"]
condition = (
lambda rule: [{rule["field"]: v} for v in rule["values"]]
if "values" in rule
else [{rule["field"]: {"!=": ""}}]
)
if_rule = {"any": sum(map(condition, rules), [])}

def create_if_rule(rule):
    """Build the list of `if` condition dicts for one combinedType sub-rule.

    Always returns a *list* so the caller can flatten the results with
    ``sum(..., [])``. (Returning a bare dict here would raise TypeError
    when concatenated with a list.)
    """
    field = rule["field"]
    values = rule.get("values", [])
    can_skip = rule.get("can_skip", False)

    if values and can_skip:
        return [{field: v, "can_skip": True} for v in values]
    if values:
        return [{field: v} for v in values]
    if can_skip:
        return [{field: {"!=": ""}, "can_skip": True}]
    return [{field: {"!=": ""}}]

if_rule = {"any": sum(map(create_if_rule, rules), [])}

rule["if"] = if_rule
return rule
Expand All @@ -588,7 +640,7 @@ def update_table(self, table: str, row: StrDict):
for match in self.spec[table]:
if "if" not in match:
match = self.default_if(table, match)
if parse_if(row, match["if"]):
if parse_if(row, match["if"], self.ctx):
self.data[table].append(
remove_null_keys(
{
Expand Down
44 changes: 44 additions & 0 deletions docs/specification.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ These metadata fields are defined under a header key `adtl`.
* **defs**: Definitions that can be referred to elsewhere in the schema
* **include-def** (list): List of additional TOML or JSON files to import as
definitions
* **skipFieldPattern** : Regex string matching field names which may be skipped
if not present in a datafile, following the same syntax as `fieldPattern` key.
* **defaultDateFormat**: Default source date format, applied to all fields
with either "date_" / "_date" in the field name or that have format date
set in the JSON schema
Expand Down Expand Up @@ -297,6 +299,48 @@ fields = [

If *excludeWhen* is not set, no exclusions take place and all values are returned as-is.

### Skippable fields

In some cases, a study will be associated with multiple data files, all of which have been
filled in to varying degrees. For example, one study site may not provide any follow-up data.

Rather than writing a new parser for every data file with minor differences, parsers can be made
robust to a certain amount of missing data by tagging applicable fields with `can_skip = True`,
for example:

```ini
[[observation]]
name = "cough"
phase = "admission"
date = { field = "admit_date" }
is_present = { field = "cough_ceoccur_v2", description = "Cough", ref = "Y/N/NK", "can_skip" = true }
```

In this case, if adtl does not find `cough_ceoccur_v2` in the data it will skip over the field
and continue, rather than throwing an error.

If there are lots of fields missing all with similar field names, for example if followup data
has been omitted and all the followup fields are labelled with a `flw` prefix e.g., `flw_cough`,
`flw2_fatigue`, this can be specified at the top of the file:

```ini
[adtl]
name = "isaric-core"
description = "isaric-core"
skipFieldPattern = "flw.*"

[table.sex_at_birth]
combinedType = "firstNonNull"
excludeWhen = "none"
fields = [
{ field = "sex", values = { 1 = "male", 2 = "female" } },
{ field = "flw_sex_at_birth", values = { 1 = "male", 2 = "female", 3 = "non_binary" } },
{ field = "flw2_sex_at_birth", values = { 1 = "male", 2 = "female", 3 = "non_binary" } },
]
```

Notice that in this case `can_skip` does not need to be added to the fields with a `flw` prefix.

### Data transformations (apply)

Arbitrary functions can be applied to source fields. adtl ships with a library
Expand Down
4 changes: 4 additions & 0 deletions schemas/dev.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@
]
}
}
},
"can_skip": {
"const": true,
"description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data."
}
}
}
Expand Down
16 changes: 16 additions & 0 deletions tests/__snapshots__/test_parser.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,22 @@

'''
# ---
# name: test_skip_field_pattern_absent
'''
adtl_valid,adtl_error,cough,epoch,followup_cough,headache,id,text
False,data.epoch must be date,1,11/01/1999,,,1,Lorem ipsum
False,data.epoch must be date,0,19/12/2022,,,2,example

'''
# ---
# name: test_skip_field_pattern_present
'''
adtl_valid,adtl_error,cough,epoch,followup_cough,headache,id,text
False,data.epoch must be date,1,11/01/1999,0,3,1,Lorem ipsum
False,data.epoch must be date,0,19/12/2022,1,0,2,example

'''
# ---
# name: test_validation
'''
adtl_valid,adtl_error,admission_date,country_iso3,dataset_id,enrolment_date,ethnicity,sex_at_birth,subject_id
Expand Down
13 changes: 13 additions & 0 deletions tests/parsers/oneToMany-missingIf.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[adtl]
name = "sampleOneToMany - missingIf"
description = "One to Many example where if statements are removed"
skipFieldPattern = "flw3.*"

[adtl.tables.observation]
kind = "oneToMany"
Expand Down Expand Up @@ -47,3 +48,15 @@
is_present = { field = "flw2_fever_{n}", values = { 0 = false, 1 = true } }
# if.any = [ { "flw2_fever_{n}" = 1 }, { "flw2_fever_{n}" = 0 } ]
for.n.range = [1, 2]

[[observation]]
name = "fatigue_malaise"
phase = "followup"
date = { field = "dt" }
is_present = { field = "flw3_fatigue", description = "Fatigue", values = { 1 = true, 0 = false } }

[[observation]]
name = "severe_dehydration"
phase = "admission"
date = { field = "dt" }
is_present = { field = "dehydration_vsorres", description = "Severe dehydration:", ref = "Y/N/NK", "can_skip" = true }
34 changes: 34 additions & 0 deletions tests/parsers/skip_field.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"adtl": {
"name": "allow-skip-field-pattern",
"description": "Tests skipping missing fields",
"skipFieldPattern": "flw.*",
"tables": {
"table": {
"kind": "oneToOne",
"schema": "../schemas/epoch-data.schema.json"
}
}
},
"table": {
"id": {
"field": "Entry_ID"
},
"epoch": {
"field": "Epoch"
},
"text": {
"field": "Text"
},
"cough": {
"field": "cough"
},
"followup_cough": {
"field": "flw_cough"
},
"headache": {
"field": "headache",
"can_skip": true
}
}
}
8 changes: 7 additions & 1 deletion tests/schemas/epoch-data.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
},
"text": {
"description": "Text field"
},
"cough": {
"description": "Standard cough field"
},
"followup_cough": {
"description": "Follow-up cough field"
}
}
}
}
4 changes: 3 additions & 1 deletion tests/schemas/observation_defaultif.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@
"headache",
"oxygen_saturation",
"pao2_sample_type",
"history_of_fever"
"history_of_fever",
"fatigue_malaise",
"severe_dehydration"
],
"description": "Observation name"
}
Expand Down
4 changes: 2 additions & 2 deletions tests/sources/oneToManyIf.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
dt,dt_1,dt_2,headache_v2,oxy_vsorres,cough_ceoccur_v2,dry_cough_ceoccur_v2,wet_cough_ceoccur_v2,pao2_lbspec,flw2_fever_1,flw2_fever_2
2022-02-05,2022-02-06,2022-02-07,2,87,3,1,2,3,1,0
dt,dt_1,dt_2,headache_v2,oxy_vsorres,cough_ceoccur_v2,dry_cough_ceoccur_v2,wet_cough_ceoccur_v2,pao2_lbspec,flw2_fever_1,flw2_fever_2,flw3_fatigue,dehydration_vsorres
2022-02-05,2022-02-06,2022-02-07,2,87,3,1,2,3,1,0,1,2
3 changes: 3 additions & 0 deletions tests/sources/skip_field_absent.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Entry_ID,Epoch,Text,cough
1,11/01/1999,Lorem ipsum,1
2,19/12/2022,example,0
3 changes: 3 additions & 0 deletions tests/sources/skip_field_present.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Entry_ID,Epoch,Text,cough,flw_cough,headache
1,11/01/1999,Lorem ipsum,1,0,3
2,19/12/2022,example,0,1,0
Loading