Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip fields #67

Merged
merged 6 commits into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 67 additions & 15 deletions adtl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datetime import datetime
from pathlib import Path
from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional, Union
from typing import Any, Dict, Iterable, List, Optional, Union, Callable

import pint
import tomli
Expand Down Expand Up @@ -63,7 +63,11 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
rule, list
): # not a container, is constant
return rule
# Check whether field is present if it's allowed to be passed over
if "field" in rule:
# do not check for condition if field is missing
if skip_field(row, rule, ctx):
return None
# do not parse field if condition is not met
if "if" in rule and not parse_if(row, rule["if"]):
return None
Expand Down Expand Up @@ -141,19 +145,34 @@ def matching_fields(fields: List[str], pattern: str) -> List[str]:
return [f for f in fields if compiled_pattern.match(f)]


def parse_if(row: StrDict, rule: StrDict) -> bool:
def parse_if(
row: StrDict, rule: StrDict, ctx: Callable[[str], dict] = None, can_skip=False
) -> bool:
"Parse conditional statements and return a boolean"

n_keys = len(rule.keys())
assert n_keys == 1
assert n_keys == 1 or n_keys == 2
if n_keys == 2:
assert "can_skip" in rule
can_skip = True
key = next(iter(rule.keys()))
if key == "not" and isinstance(rule[key], dict):
return not parse_if(row, rule[key])
return not parse_if(row, rule[key], ctx, can_skip)
elif key == "any" and isinstance(rule[key], list):
return any(parse_if(row, r) for r in rule[key])
return any(parse_if(row, r, ctx, can_skip) for r in rule[key])
elif key == "all" and isinstance(rule[key], list):
return all(parse_if(row, r) for r in rule[key])
attr_value = row[key]
return all(parse_if(row, r, ctx, can_skip) for r in rule[key])
try:
attr_value = row[key]
except KeyError as e:
if can_skip is True:
return False
elif ctx:
if skip_field(row, {"field": key}, ctx(key)):
return False
else:
raise e

if isinstance(rule[key], dict):
cmp = next(iter(rule[key]))
value = rule[key][cmp]
Expand Down Expand Up @@ -401,6 +420,15 @@ def read_definition(file: Path) -> Dict[str, Any]:
raise ValueError(f"Unsupported file format: {file}")


def skip_field(row: StrDict, rule: StrDict, ctx: Context = None):
    """Return True when the rule's field is absent from the row and may be skipped.

    A field is skippable either when the rule itself carries ``can_skip``,
    or when the parser context provides a compiled ``skip_pattern`` regex
    that matches the field name.
    """
    if rule.get("can_skip"):
        return rule["field"] not in row
    if ctx:
        pattern = ctx.get("skip_pattern")
        if pattern and pattern.match(rule["field"]):
            return rule["field"] not in row
    return False


class Parser:
def __init__(self, spec: Union[str, Path, StrDict], include_defs: List[str] = []):
"Loads specification from spec in format (default json)"
Expand Down Expand Up @@ -485,6 +513,9 @@ def ctx(self, attribute: str):
"defaultDateFormat": self.header.get(
"defaultDateFormat", DEFAULT_DATE_FORMAT
),
"skip_pattern": re.compile(self.header.get("skipFieldPattern"))
if self.header.get("skipFieldPattern")
else False,
}

def validate_spec(self):
Expand Down Expand Up @@ -548,7 +579,13 @@ def default_if(self, table: str, rule: StrDict):
if "combinedType" not in rule[option]:
field = rule[option]["field"]
if "values" in rule[option]:
if_rule = {"any": [{field: v} for v in rule[option]["values"]]}
values = rule[option]["values"]
if "can_skip" in rule[option]:
if_rule = {"any": [{field: v, "can_skip": True} for v in values]}
else:
if_rule = {"any": [{field: v} for v in values]}
elif "can_skip" in rule[option]:
if_rule = {field: {"!=": ""}, "can_skip": True}
else:
if_rule = {field: {"!=": ""}}
else:
Expand All @@ -560,12 +597,27 @@ def default_if(self, table: str, rule: StrDict):
"list",
], f"Invalid combinedType: {rule[option]['combinedType']}"
rules = rule[option]["fields"]
condition = (
lambda rule: [{rule["field"]: v} for v in rule["values"]]
if "values" in rule
else [{rule["field"]: {"!=": ""}}]
)
if_rule = {"any": sum(map(condition, rules), [])}

def create_if_rule(rule):
    """Build the list of `if` condition dicts for one combinedType sub-rule.

    Always returns a *list* so the caller can flatten the results with
    ``sum(..., [])``. (Returning a bare dict here would raise TypeError
    when concatenated with a list.)
    """
    field = rule["field"]
    values = rule.get("values", [])
    can_skip = rule.get("can_skip", False)

    if values and can_skip:
        return [{field: v, "can_skip": True} for v in values]
    if values:
        return [{field: v} for v in values]
    if can_skip:
        return [{field: {"!=": ""}, "can_skip": True}]
    return [{field: {"!=": ""}}]

if_rule = {"any": sum(map(create_if_rule, rules), [])}

rule["if"] = if_rule
return rule
Expand All @@ -588,7 +640,7 @@ def update_table(self, table: str, row: StrDict):
for match in self.spec[table]:
if "if" not in match:
match = self.default_if(table, match)
if parse_if(row, match["if"]):
if parse_if(row, match["if"], self.ctx):
self.data[table].append(
remove_null_keys(
{
Expand Down
44 changes: 44 additions & 0 deletions docs/specification.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ These metadata fields are defined under a header key `adtl`.
* **defs**: Definitions that can be referred to elsewhere in the schema
* **include-def** (list): List of additional TOML or JSON files to import as
definitions
* **skipFieldPattern** : Regex string matching field names which may be skipped
if not present in a datafile, following the same syntax as `fieldPattern` key.
* **defaultDateFormat**: Default source date format, applied to all fields
with either "date_" / "_date" in the field name or that have format date
set in the JSON schema
Expand Down Expand Up @@ -297,6 +299,48 @@ fields = [

If *excludeWhen* is not set, no exclusions take place and all values are returned as-is.

### Skippable fields

In some cases, a study will be associated with multiple data files, all of which have been
filled in to varying degrees. For example, one study site may not provide any follow-up data.

Rather than writing a new parser for every data file with minor differences, parsers can be made
robust to a certain amount of missing data by tagging applicable fields with `can_skip = True`,
for example:

```ini
[[observation]]
name = "cough"
phase = "admission"
date = { field = "admit_date" }
is_present = { field = "cough_ceoccur_v2", description = "Cough", ref = "Y/N/NK", "can_skip" = true }
```

In this case, if adtl does not find `cough_ceoccur_v2` in the data it will skip over the field
and continue, rather than throwing an error.

If there are lots of fields missing all with similar field names, for example if followup data
has been omitted and all the followup fields are labelled with a `flw` prefix e.g., `flw_cough`,
`flw2_fatigue`, this can be specified at the top of the file:

```ini
[adtl]
name = "isaric-core"
description = "isaric-core"
skipFieldPattern = "flw.*"

[table.sex_at_birth]
combinedType = "firstNonNull"
excludeWhen = "none"
fields = [
{ field = "sex", values = { 1 = "male", 2 = "female" } },
{ field = "flw_sex_at_birth", values = { 1 = "male", 2 = "female", 3 = "non_binary" } },
{ field = "flw2_sex_at_birth", values = { 1 = "male", 2 = "female", 3 = "non_binary" } },
]
```

Notice that in this case `can_skip` does not need to be added to the fields with a `flw` prefix.

### Data transformations (apply)

Arbitrary functions can be applied to source fields. adtl ships with a library
Expand Down
4 changes: 4 additions & 0 deletions schemas/dev.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@
]
}
}
},
"can_skip": {
"const": true,
"description": "Indicates to the parser whether the field can be skipped without throwing an error if missing in the data."
}
}
}
Expand Down
16 changes: 16 additions & 0 deletions tests/__snapshots__/test_parser.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,22 @@

'''
# ---
# name: test_skip_field_pattern_absent
'''
adtl_valid,adtl_error,cough,epoch,followup_cough,headache,id,text
False,data.epoch must be date,1,11/01/1999,,,1,Lorem ipsum
False,data.epoch must be date,0,19/12/2022,,,2,example

'''
# ---
# name: test_skip_field_pattern_present
'''
adtl_valid,adtl_error,cough,epoch,followup_cough,headache,id,text
False,data.epoch must be date,1,11/01/1999,0,3,1,Lorem ipsum
False,data.epoch must be date,0,19/12/2022,1,0,2,example

'''
# ---
# name: test_validation
'''
adtl_valid,adtl_error,admission_date,country_iso3,dataset_id,enrolment_date,ethnicity,sex_at_birth,subject_id
Expand Down
13 changes: 13 additions & 0 deletions tests/parsers/oneToMany-missingIf.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[adtl]
name = "sampleOneToMany - missingIf"
description = "One to Many example where if statements are removed"
skipFieldPattern = "flw3.*"

[adtl.tables.observation]
kind = "oneToMany"
Expand Down Expand Up @@ -47,3 +48,15 @@
is_present = { field = "flw2_fever_{n}", values = { 0 = false, 1 = true } }
# if.any = [ { "flw2_fever_{n}" = 1 }, { "flw2_fever_{n}" = 0 } ]
for.n.range = [1, 2]

[[observation]]
name = "fatigue_malaise"
phase = "followup"
date = { field = "dt" }
is_present = { field = "flw3_fatigue", description = "Fatigue", values = { 1 = true, 0 = false } }

[[observation]]
name = "severe_dehydration"
phase = "admission"
date = { field = "dt" }
is_present = { field = "dehydration_vsorres", description = "Severe dehydration:", ref = "Y/N/NK", "can_skip" = true }
34 changes: 34 additions & 0 deletions tests/parsers/skip_field.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"adtl": {
"name": "allow-skip-field-pattern",
"description": "Tests skipping missing fields",
"skipFieldPattern": "flw.*",
"tables": {
"table": {
"kind": "oneToOne",
"schema": "../schemas/epoch-data.schema.json"
}
}
},
"table": {
"id": {
"field": "Entry_ID"
},
"epoch": {
"field": "Epoch"
},
"text": {
"field": "Text"
},
"cough": {
"field": "cough"
},
"followup_cough": {
"field": "flw_cough"
},
"headache": {
"field": "headache",
"can_skip": true
}
}
}
8 changes: 7 additions & 1 deletion tests/schemas/epoch-data.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
},
"text": {
"description": "Text field"
},
"cough": {
"description": "Standard cough field"
},
"followup_cough": {
"description": "Follow-up cough field"
}
}
}
}
4 changes: 3 additions & 1 deletion tests/schemas/observation_defaultif.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@
"headache",
"oxygen_saturation",
"pao2_sample_type",
"history_of_fever"
"history_of_fever",
"fatigue_malaise",
"severe_dehydration"
],
"description": "Observation name"
}
Expand Down
4 changes: 2 additions & 2 deletions tests/sources/oneToManyIf.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
dt,dt_1,dt_2,headache_v2,oxy_vsorres,cough_ceoccur_v2,dry_cough_ceoccur_v2,wet_cough_ceoccur_v2,pao2_lbspec,flw2_fever_1,flw2_fever_2
2022-02-05,2022-02-06,2022-02-07,2,87,3,1,2,3,1,0
dt,dt_1,dt_2,headache_v2,oxy_vsorres,cough_ceoccur_v2,dry_cough_ceoccur_v2,wet_cough_ceoccur_v2,pao2_lbspec,flw2_fever_1,flw2_fever_2,flw3_fatigue,dehydration_vsorres
2022-02-05,2022-02-06,2022-02-07,2,87,3,1,2,3,1,0,1,2
3 changes: 3 additions & 0 deletions tests/sources/skip_field_absent.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Entry_ID,Epoch,Text,cough
1,11/01/1999,Lorem ipsum,1
2,19/12/2022,example,0
3 changes: 3 additions & 0 deletions tests/sources/skip_field_present.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Entry_ID,Epoch,Text,cough,flw_cough,headache
1,11/01/1999,Lorem ipsum,1,0,3
2,19/12/2022,example,0,1,0
Loading