Skip to content

Commit

Permalink
Add returnUnmatched to spec options (#96)
Browse files Browse the repository at this point in the history
* Add 'returnUnmatched' option to return original value

* Add check for incompatible 'parquet' and 'returnUnmatched' options
  • Loading branch information
pipliggins authored Oct 7, 2024
1 parent 8e4c96d commit 5b8498c
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 34 deletions.
53 changes: 37 additions & 16 deletions adtl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
import requests
import fastjsonschema
from tqdm import tqdm
import warnings

import adtl.transformations as tf
from adtl.transformations import AdtlTransformationWarning

SUPPORTED_FORMATS = {"json": json.load, "toml": tomli.load}
DEFAULT_DATE_FORMAT = "%Y-%m-%d"
Expand Down Expand Up @@ -77,6 +79,7 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
if "apply" in rule:
# apply data transformations.
transformation = rule["apply"]["function"]
params = None
if "params" in rule["apply"]:
params = []
for i in range(len(rule["apply"]["params"])):
Expand All @@ -100,21 +103,25 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
else:
params.append(rule["apply"]["params"][i])

try:
value = getattr(tf, transformation)(value, *params)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
else:
try:
value = getattr(tf, transformation)(value)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
try:
with warnings.catch_warnings():
warnings.simplefilter("error", category=AdtlTransformationWarning)
if params:
value = getattr(tf, transformation)(value, *params)
else:
value = getattr(tf, transformation)(value)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
except AdtlTransformationWarning as e:
if ctx and ctx.get("returnUnmatched"):
warnings.warn(str(e), AdtlTransformationWarning)
return value
else:
logging.error(str(e))
return None
return value
if value == "":
return None
Expand All @@ -123,10 +130,14 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
value = value.lower()
rule["values"] = {k.lower(): v for k, v in rule["values"].items()}

if rule.get("ignoreMissingKey"):
if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")):
value = rule["values"].get(value, value)
else:
value = rule["values"].get(value)

# recheck if value is empty after mapping (use to map values to None)
if value == "":
return None
# Either source_unit / unit OR source_date / date triggers conversion
# do not parse units if value is empty
if "source_unit" in rule and "unit" in rule:
Expand All @@ -142,6 +153,9 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
try:
value = pint.Quantity(float(value), source_unit).to(unit).m
except ValueError:
if ctx and ctx.get("returnUnmatched"):
logging.debug(f"Could not convert {value} to a floating point")
return value
raise ValueError(f"Could not convert {value} to a floating point")
if "source_date" in rule or (ctx and ctx.get("is_date")):
assert "source_unit" not in rule and "unit" not in rule
Expand All @@ -156,6 +170,8 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
value = datetime.strptime(value, source_date).strftime(target_date)
except (TypeError, ValueError):
logging.info(f"Could not parse date: {value}")
if ctx and ctx.get("returnUnmatched"):
return value
return None
return value
elif "combinedType" in rule:
Expand Down Expand Up @@ -609,6 +625,7 @@ def ctx(self, attribute: str):
if self.header.get("skipFieldPattern")
else False
),
"returnUnmatched": self.header.get("returnUnmatched", False),
}

def validate_spec(self):
Expand Down Expand Up @@ -1042,6 +1059,10 @@ def main(argv=None):
include_defs = args.include_def or []
spec = Parser(args.spec, include_defs=include_defs, quiet=args.quiet)

# check for incompatible options
if spec.header.get("returnUnmatched") and args.parquet:
raise ValueError("returnUnmatched and parquet options are incompatible")

# run adtl
adtl_output = spec.parse(args.file, encoding=args.encoding)
adtl_output.save(args.output or spec.name, args.parquet)
Expand Down
73 changes: 55 additions & 18 deletions adtl/transformations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
""" Functions which can be applied to source fields, allowing extensibility """

import logging
from typing import Any, Optional, List
from datetime import datetime, timedelta, date

Expand All @@ -17,6 +16,12 @@

from typing import Literal, Union

import warnings


class AdtlTransformationWarning(UserWarning):
    """Warning category issued when a transformation cannot process its input."""


def isNotNull(value: Optional[str]) -> bool:
"Returns whether value is not null or an empty string"
Expand Down Expand Up @@ -55,6 +60,12 @@ def wordSubstituteSet(value: str, *params) -> List[str]:
for match, subst in sub_map.items():
if re.search(r"\b" + match + r"\b", value, re.IGNORECASE):
out.append(subst)
if not out and (value not in [None, ""]):
warnings.warn(
f"No matches found for: '{value}'",
AdtlTransformationWarning,
stacklevel=2,
)
return sorted(set(out)) if out else None


Expand Down Expand Up @@ -118,18 +129,19 @@ def yearsElapsed(
bd_format: str = "%Y-%m-%d",
cd_format: str = "%Y-%m-%d",
):
"""Returns the number of years elapsed between two dates, useful for calculating ages
"""
Returns the number of years elapsed between two dates, useful for calculating ages
Args:
birthdate: Start date of duration
currentdate: End date of duration
epoch: Epoch year after which dates will be converted to the last century.
As an example, if epoch is 2022, then the date 1/1/23 will be converted
to the January 1, 1923.
bd_format: Date format for *birthdate* specified using :manpage:`strftime(3)` conventions.
Defaults to ISO format ("%Y-%m-%d")
cd_format: Date format for *currentdate* specified using :manpage:`strftime(3)` conventions.
Defaults to ISO format ("%Y-%m-%d")
bd_format: Date format for *birthdate* specified using :manpage:`strftime(3)`
conventions. Defaults to ISO format ("%Y-%m-%d")
cd_format: Date format for *currentdate* specified using :manpage:`strftime(3)`
conventions. Defaults to ISO format ("%Y-%m-%d")
Returns:
int | None: Number of years elapsed or None if invalid dates were encountered
Expand All @@ -145,8 +157,15 @@ def yearsElapsed(

cd = datetime.strptime(currentdate, cd_format)

days = cd - bd
return pint.Quantity(days.days, "days").to("years").m
try:
days = cd - bd
return pint.Quantity(days.days, "days").to("years").m
except ValueError:
warnings.warn(
f"Failed calculation yearsElapsed: {birthdate}, {currentdate}",
AdtlTransformationWarning,
stacklevel=2,
)


def durationDays(startdate: str, currentdate: str) -> int:
Expand Down Expand Up @@ -210,15 +229,19 @@ def makeDate(year: str, month: str, day: str) -> str:
try:
year, month, day = int(year), int(month), int(day)
except ValueError:
logging.error(
f"Error in casting to integer: year={year}, month={month}, day={day}"
warnings.warn(
f"Could not construct date from: year={year}, month={month}, day={day}",
AdtlTransformationWarning,
stacklevel=2,
)
return None
try:
return date(year, month, day).isoformat()
except ValueError:
logging.error(
f"Could not construct date from: year={year}, month={month}, day={day}"
warnings.warn(
f"Could not construct date from: year={year}, month={month}, day={day}",
AdtlTransformationWarning,
stacklevel=2,
)
return None

Expand All @@ -245,8 +268,10 @@ def makeDateTimeFromSeconds(
tzinfo=zoneinfo.ZoneInfo(timezone)
)
except ValueError:
logging.error(
f"Could not convert date {date!r} from date format {date_format!r}"
warnings.warn(
f"Could not convert date {date!r} from date format {date_format!r}",
AdtlTransformationWarning,
stacklevel=2,
)
return None
if time_seconds == "":
Expand Down Expand Up @@ -279,8 +304,10 @@ def makeDateTime(
tzinfo=zoneinfo.ZoneInfo(timezone)
)
except ValueError:
logging.error(
f"Could not convert date {date!r} from date format {date_format!r}"
warnings.warn(
f"Could not convert date {date!r} from date format {date_format!r}",
AdtlTransformationWarning,
stacklevel=2,
)
return None

Expand Down Expand Up @@ -315,6 +342,11 @@ def splitDate(
elif option == "day":
return sd.day
else:
warnings.warn(
f"Invalid option {option!r} for splitDate",
AdtlTransformationWarning,
stacklevel=2,
)
return None


Expand All @@ -330,7 +362,8 @@ def startYear(
Use to calculate year e.g. of birth from date (e.g. current date) and
duration (e.g. age)
The date can be provided as a list of possible dates (if a hierarchy needs searching through)
The date can be provided as a list of possible dates (if a hierarchy needs
searching through)
Args:
duration: Duration value
Expand Down Expand Up @@ -442,7 +475,11 @@ def correctOldDate(date: str, epoch: float, format: str, return_datetime: bool =
try:
cd = datetime.strptime(date, format)
except ValueError:
logging.error(f"Could not convert date {date!r} from date format {format!r}")
warnings.warn(
f"Could not convert date {date!r} from date format {format!r}",
AdtlTransformationWarning,
stacklevel=2,
)
return None

if cd.year >= epoch and "y" in format:
Expand Down
7 changes: 7 additions & 0 deletions docs/specification.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ if not present in a datafile, following the same syntax as `fieldPattern` key.
* **defaultDateFormat**: Default source date format, applied to all fields
with either "date_" / "_date" in the field name or that have format date
set in the JSON schema
* **returnUnmatched**: Returns the original, unconverted value for any field that cannot be converted
according to the provided rules and formats. For fields with [value mappings](#field-with-value-mapping), it is equivalent to using `ignoreMissingKey`. Fields using [data transformation functions](#data-transformations-(apply)) will issue a warning to the
terminal describing the error in the transformation. Transformations requiring multiple
parameters will only return the untransformed value of the current field, not the values of the additional parameters.
> :warning: This is likely to return columns with non-matching datatypes. External JSON
validation may fail. This option is incompatible with the `--parquet` option to save
outputs as parquet files (which requires a consistent type down each column).

## Validation

Expand Down
7 changes: 7 additions & 0 deletions tests/__snapshots__/test_parser.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@

'''
# ---
# name: test_return_unmapped
'''
age,date_death,date_of_birth,diabetes_type,has_smoking,pregnancy_birth_weight_kg,subject_id
fifteen,2023,2023-11-20,no diabetes,today,eight,1

'''
# ---
# name: test_show_report
'''

Expand Down
46 changes: 46 additions & 0 deletions tests/parsers/return-unmapped.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Test specification exercising the `returnUnmatched` header option.
# It is parsed together with tests/sources/return-unmapped.csv, whose single
# row holds values that the rules below cannot convert; the expected snapshot
# shows each of those source values passed through unchanged.
[adtl]
name = "test-return-unmapped"
description = "Example using returnUnmatched to return unmapped fields"
# Header flag under test.
returnUnmatched = true

[adtl.tables.subject]
kind = "groupBy"
groupBy = "subject_id"
aggregation = "lastNotNull"

[subject]

[subject.subject_id]
field = "subjid"
description = "Subject ID"

# Date parsing case: the source row holds "2023-11-20", which does not fit "%m".
[subject.date_of_birth]
field = "first_admit"
source_date = "%m"

# Transformation without params: the source row holds "fifteen".
[subject.age]
field = "age"
apply = {function = "getFloat"}

# Unit conversion case: the source row holds "eight", which is not a number.
[subject.pregnancy_birth_weight_kg]
field = "weight"
unit = "kg"
source_unit = "lbs"

# Value mapping case: the source row holds "today", which is not a key below.
[subject.has_smoking]
field = "smoking"
values = { 1 = "current", 2 = "never", 3 = "former" }

# Transformation with params: the source row holds "no diabetes"; the snapshot
# shows it returned as-is rather than substituted.
[subject.diabetes_type]
field = "diabetes_type"

apply.function = "wordSubstituteSet"
apply.params = [
["type[\\s\\-]?1", "type-1"],
["type[\\s\\-]?2", "type-2"]
]

# Multi-field transformation: the source row gives year 2023, month 11, day 80,
# which is not a valid date. The snapshot shows only the `death_year` value
# ("2023") being returned, not the month/day parameters.
[subject.date_death]
field = "death_year"
apply.function = "makeDate"
apply.params = ["$death_month", "$death_day"]
2 changes: 2 additions & 0 deletions tests/sources/return-unmapped.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
subjid,redcap,first_admit,age,weight,smoking,diabetes_type,death_year,death_month,death_day
1,admit,2023-11-20,fifteen,eight,today,no diabetes,2023,11,80
25 changes: 25 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,6 +1227,22 @@ def test_main_parquet():
Path("output-table.parquet").unlink()


def test_main_parquet_error():
    """A spec with returnUnmatched combined with --parquet must raise ValueError."""
    spec_path = str(TEST_PARSERS_PATH / "return-unmapped.toml")
    source_path = str(TEST_SOURCES_PATH / "return-unmapped.csv")
    cli_args = [
        spec_path,
        source_path,
        "-o",
        "output",
        "--encoding",
        "utf-8",
        "--parquet",
    ]
    expected_message = "returnUnmatched and parquet options are incompatible"

    with pytest.raises(ValueError, match=expected_message):
        parser.main(cli_args)


@responses.activate
def test_main_web_schema(snapshot):
# test with schema on the web
Expand Down Expand Up @@ -1360,3 +1376,12 @@ def test_no_overwriting():
.read_table("visit")
)
assert overwriting_output == OVERWRITE_OUTPUT


def test_return_unmapped(snapshot):
    """Compare the subject table CSV for the returnUnmatched spec to its snapshot."""
    spec = parser.Parser(TEST_PARSERS_PATH / "return-unmapped.toml")
    parsed = spec.parse(TEST_SOURCES_PATH / "return-unmapped.csv")
    subject_csv = parsed.write_csv("subject")
    assert subject_csv == snapshot

0 comments on commit 5b8498c

Please sign in to comment.