Add returnUnmatched to spec options #96

Merged: 6 commits, Oct 7, 2024
53 changes: 37 additions & 16 deletions adtl/__init__.py
@@ -20,8 +20,10 @@
import requests
import fastjsonschema
from tqdm import tqdm
import warnings

import adtl.transformations as tf
from adtl.transformations import AdtlTransformationWarning

SUPPORTED_FORMATS = {"json": json.load, "toml": tomli.load}
DEFAULT_DATE_FORMAT = "%Y-%m-%d"
@@ -77,6 +79,7 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
if "apply" in rule:
# apply data transformations.
transformation = rule["apply"]["function"]
params = None
if "params" in rule["apply"]:
params = []
for i in range(len(rule["apply"]["params"])):
@@ -100,21 +103,25 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
else:
params.append(rule["apply"]["params"][i])

try:
value = getattr(tf, transformation)(value, *params)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
else:
try:
value = getattr(tf, transformation)(value)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
try:
with warnings.catch_warnings():
warnings.simplefilter("error", category=AdtlTransformationWarning)
if params:
value = getattr(tf, transformation)(value, *params)
else:
value = getattr(tf, transformation)(value)
except AttributeError:
raise AttributeError(
f"Error using a data transformation: Function {transformation} "
"has not been defined."
)
except AdtlTransformationWarning as e:
if ctx and ctx.get("returnUnmatched"):
warnings.warn(str(e), AdtlTransformationWarning)
return value
else:
logging.error(str(e))
return None
return value
if value == "":
return None
@@ -123,10 +130,14 @@ def get_value_unhashed(row: StrDict, rule: Rule, ctx: Context = None) -> Any:
value = value.lower()
rule["values"] = {k.lower(): v for k, v in rule["values"].items()}

if rule.get("ignoreMissingKey"):
if rule.get("ignoreMissingKey") or (ctx and ctx.get("returnUnmatched")):
value = rule["values"].get(value, value)
else:
value = rule["values"].get(value)

# recheck if value is empty after mapping (used to map values to None)
if value == "":
return None
# Either source_unit / unit OR source_date / date triggers conversion
# do not parse units if value is empty
if "source_unit" in rule and "unit" in rule:
@@ -142,6 +153,9 @@
try:
value = pint.Quantity(float(value), source_unit).to(unit).m
except ValueError:
if ctx and ctx.get("returnUnmatched"):
logging.debug(f"Could not convert {value} to a floating point")
return value
raise ValueError(f"Could not convert {value} to a floating point")
if "source_date" in rule or (ctx and ctx.get("is_date")):
assert "source_unit" not in rule and "unit" not in rule
@@ -156,6 +170,8 @@
value = datetime.strptime(value, source_date).strftime(target_date)
except (TypeError, ValueError):
logging.info(f"Could not parse date: {value}")
if ctx and ctx.get("returnUnmatched"):
return value
return None
return value
elif "combinedType" in rule:
@@ -609,6 +625,7 @@ def ctx(self, attribute: str):
if self.header.get("skipFieldPattern")
else False
),
"returnUnmatched": self.header.get("returnUnmatched", False),
}

def validate_spec(self):
@@ -1042,6 +1059,10 @@ def main(argv=None):
include_defs = args.include_def or []
spec = Parser(args.spec, include_defs=include_defs, quiet=args.quiet)

# check for incompatible options
if spec.header.get("returnUnmatched") and args.parquet:
raise ValueError("returnUnmatched and parquet options are incompatible")

# run adtl
adtl_output = spec.parse(args.file, encoding=args.encoding)
adtl_output.save(args.output or spec.name, args.parquet)
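Not part of the diff: a minimal, self-contained sketch of the standard-library pattern the new `get_value_unhashed` logic builds on. Inside `warnings.catch_warnings()`, `simplefilter("error", category=AdtlTransformationWarning)` promotes that single warning class to an exception, so the caller can decide whether to pass the raw value through (`returnUnmatched`) or return `None`. The `flaky_transform` helper is hypothetical.

```python
import warnings


class AdtlTransformationWarning(UserWarning):
    """Warning class emitted by transformation functions (as in this PR)."""


def flaky_transform(value):
    # Hypothetical stand-in for an adtl transformation that cannot map `value`
    warnings.warn(f"No matches found for: '{value}'", AdtlTransformationWarning, stacklevel=2)
    return None


def apply_transform(value, return_unmatched=False):
    try:
        with warnings.catch_warnings():
            # Turn only AdtlTransformationWarning into an exception inside this block
            warnings.simplefilter("error", category=AdtlTransformationWarning)
            return flaky_transform(value)
    except AdtlTransformationWarning as e:
        if return_unmatched:
            warnings.warn(str(e), AdtlTransformationWarning)  # surface it, but keep the value
            return value
        return None


print(apply_transform("unmappable", return_unmatched=True))  # -> unmappable (plus a warning)
print(apply_transform("unmappable"))                          # -> None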
73 changes: 55 additions & 18 deletions adtl/transformations.py
@@ -1,6 +1,5 @@
""" Functions which can be applied to source fields, allowing extensibility """

import logging
from typing import Any, Optional, List
from datetime import datetime, timedelta, date

@@ -17,6 +16,12 @@

from typing import Literal, Union

import warnings


class AdtlTransformationWarning(UserWarning):
pass


def isNotNull(value: Optional[str]) -> bool:
"Returns whether value is not null or an empty string"
@@ -55,6 +60,12 @@ def wordSubstituteSet(value: str, *params) -> List[str]:
for match, subst in sub_map.items():
if re.search(r"\b" + match + r"\b", value, re.IGNORECASE):
out.append(subst)
if not out and (value not in [None, ""]):
warnings.warn(
f"No matches found for: '{value}'",
AdtlTransformationWarning,
stacklevel=2,
)
return sorted(set(out)) if out else None


@@ -118,18 +129,19 @@ def yearsElapsed(
bd_format: str = "%Y-%m-%d",
cd_format: str = "%Y-%m-%d",
):
"""Returns the number of years elapsed between two dates, useful for calculating ages
"""
Returns the number of years elapsed between two dates, useful for calculating ages

Args:
birthdate: Start date of duration
currentdate: End date of duration
epoch: Epoch year after which dates will be converted to the last century.
As an example, if epoch is 2022, then the date 1/1/23 will be converted
to January 1, 1923.
bd_format: Date format for *birthdate* specified using :manpage:`strftime(3)` conventions.
Defaults to ISO format ("%Y-%m-%d")
cd_format: Date format for *currentdate* specified using :manpage:`strftime(3)` conventions.
Defaults to ISO format ("%Y-%m-%d")
bd_format: Date format for *birthdate* specified using :manpage:`strftime(3)`
conventions. Defaults to ISO format ("%Y-%m-%d")
cd_format: Date format for *currentdate* specified using :manpage:`strftime(3)`
conventions. Defaults to ISO format ("%Y-%m-%d")

Returns:
int | None: Number of years elapsed or None if invalid dates were encountered
@@ -145,8 +157,15 @@

cd = datetime.strptime(currentdate, cd_format)

days = cd - bd
return pint.Quantity(days.days, "days").to("years").m
try:
days = cd - bd
return pint.Quantity(days.days, "days").to("years").m
except ValueError:
warnings.warn(
f"Failed calculation yearsElapsed: {birthdate}, {currentdate}",
AdtlTransformationWarning,
stacklevel=2,
)


def durationDays(startdate: str, currentdate: str) -> int:
@@ -210,15 +229,19 @@ def makeDate(year: str, month: str, day: str) -> str:
try:
year, month, day = int(year), int(month), int(day)
except ValueError:
logging.error(
f"Error in casting to integer: year={year}, month={month}, day={day}"
warnings.warn(
f"Could not construct date from: year={year}, month={month}, day={day}",
AdtlTransformationWarning,
stacklevel=2,
)
return None
try:
return date(year, month, day).isoformat()
except ValueError:
logging.error(
f"Could not construct date from: year={year}, month={month}, day={day}"
warnings.warn(
f"Could not construct date from: year={year}, month={month}, day={day}",
AdtlTransformationWarning,
stacklevel=2,
)
return None

Expand All @@ -245,8 +268,10 @@ def makeDateTimeFromSeconds(
tzinfo=zoneinfo.ZoneInfo(timezone)
)
except ValueError:
logging.error(
f"Could not convert date {date!r} from date format {date_format!r}"
warnings.warn(
f"Could not convert date {date!r} from date format {date_format!r}",
AdtlTransformationWarning,
stacklevel=2,
)
return None
if time_seconds == "":
@@ -279,8 +304,10 @@ def makeDateTime(
tzinfo=zoneinfo.ZoneInfo(timezone)
)
except ValueError:
logging.error(
f"Could not convert date {date!r} from date format {date_format!r}"
warnings.warn(
f"Could not convert date {date!r} from date format {date_format!r}",
AdtlTransformationWarning,
stacklevel=2,
)
return None

@@ -315,6 +342,11 @@ def splitDate(
elif option == "day":
return sd.day
else:
warnings.warn(
f"Invalid option {option!r} for splitDate",
AdtlTransformationWarning,
stacklevel=2,
)
return None


@@ -330,7 +362,8 @@ def startYear(
Use to calculate year e.g. of birth from date (e.g. current date) and
duration (e.g. age)

The date can be provided as a list of possible dates (if a hierarchy needs searching through)
The date can be provided as a list of possible dates (if a hierarchy needs
searching through)

Args:
duration: Duration value
@@ -442,7 +475,11 @@ def correctOldDate(date: str, epoch: float, format: str, return_datetime: bool =
try:
cd = datetime.strptime(date, format)
except ValueError:
logging.error(f"Could not convert date {date!r} from date format {format!r}")
warnings.warn(
f"Could not convert date {date!r} from date format {format!r}",
AdtlTransformationWarning,
stacklevel=2,
)
return None

if cd.year >= epoch and "y" in format:
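A side note on the recurring `stacklevel=2` argument in these transformations (a standard-library behaviour sketch, not from the PR discussion): it attributes the warning to the function's caller, so the reported location points at the code applying the transformation rather than at the helper that detected the problem.

```python
import warnings


def transform(value):
    # stacklevel=2 makes the warning report the line in transform()'s caller
    warnings.warn(f"could not transform {value!r}", UserWarning, stacklevel=2)


def parse_row():
    transform("fifteen")  # the emitted warning is attributed to this line


parse_row()
```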
7 changes: 7 additions & 0 deletions docs/specification.md
@@ -53,6 +53,13 @@ if not present in a datafile, following the same syntax as `fieldPattern` key.
* **defaultDateFormat**: Default source date format, applied to all fields
with either "date_" / "_date" in the field name or that have format date
set in the JSON schema
* **returnUnmatched**: Returns all values that cannot be converted
according to the provided rules and formats. For fields with
[value mappings](#field-with-value-mapping), it is equivalent to using `ignoreMissingKey`.
Fields using [data transformation functions](#data-transformations-(apply)) will issue a
warning to the terminal describing the error in the transformation. Transformations that
take multiple parameters will return only the current field's value, untransformed.
> :warning: This is likely to return columns with non-matching datatypes. External JSON
validation may fail. This option is incompatible with the `--parquet` option for saving
outputs as parquet files (which requires a consistent type down each column).
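
A usage sketch (not part of the documentation change): it mirrors the `test_return_unmapped` test added in this PR, assumes `Parser` is importable from the top-level `adtl` package, and uses placeholder file paths.

```python
from adtl import Parser

# The spec's [adtl] header sets returnUnmatched = true, so values that cannot
# be mapped or transformed are passed through instead of becoming None.
csv_text = (
    Parser("parsers/return-unmapped.toml")
    .parse("sources/return-unmapped.csv")
    .write_csv("subject")
)
print(csv_text)  # e.g. the age column keeps the raw string "fifteen"
```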

## Validation

7 changes: 7 additions & 0 deletions tests/__snapshots__/test_parser.ambr
@@ -47,6 +47,13 @@

'''
# ---
# name: test_return_unmapped
'''
age,date_death,date_of_birth,diabetes_type,has_smoking,pregnancy_birth_weight_kg,subject_id
fifteen,2023,2023-11-20,no diabetes,today,eight,1

'''
# ---
# name: test_show_report
'''

46 changes: 46 additions & 0 deletions tests/parsers/return-unmapped.toml
@@ -0,0 +1,46 @@
[adtl]
name = "test-return-unmapped"
description = "Example using returnUnmatched to return unmapped fields"
returnUnmatched = true

[adtl.tables.subject]
kind = "groupBy"
groupBy = "subject_id"
aggregation = "lastNotNull"

[subject]

[subject.subject_id]
field = "subjid"
description = "Subject ID"

[subject.date_of_birth]
field = "first_admit"
source_date = "%m"

[subject.age]
field = "age"
apply = {function = "getFloat"}

[subject.pregnancy_birth_weight_kg]
field = "weight"
unit = "kg"
source_unit = "lbs"

[subject.has_smoking]
field = "smoking"
values = { 1 = "current", 2 = "never", 3 = "former" }

[subject.diabetes_type]
field = "diabetes_type"

apply.function = "wordSubstituteSet"
apply.params = [
["type[\\s\\-]?1", "type-1"],
["type[\\s\\-]?2", "type-2"]
]

[subject.date_death]
field = "death_year"
apply.function = "makeDate"
apply.params = ["$death_month", "$death_day"]
2 changes: 2 additions & 0 deletions tests/sources/return-unmapped.csv
@@ -0,0 +1,2 @@
subjid,redcap,first_admit,age,weight,smoking,diabetes_type,death_year,death_month,death_day
1,admit,2023-11-20,fifteen,eight,today,no diabetes,2023,11,80
25 changes: 25 additions & 0 deletions tests/test_parser.py
@@ -1227,6 +1227,22 @@ def test_main_parquet():
Path("output-table.parquet").unlink()


def test_main_parquet_error():
ARG = [
str(TEST_PARSERS_PATH / "return-unmapped.toml"),
str(TEST_SOURCES_PATH / "return-unmapped.csv"),
"-o",
"output",
"--encoding",
"utf-8",
]

with pytest.raises(
ValueError, match="returnUnmatched and parquet options are incompatible"
):
parser.main(ARG + ["--parquet"])


@responses.activate
def test_main_web_schema(snapshot):
# test with schema on the web
@@ -1360,3 +1376,12 @@ def test_no_overwriting():
.read_table("visit")
)
assert overwriting_output == OVERWRITE_OUTPUT


def test_return_unmapped(snapshot):
transformed_csv_data = (
parser.Parser(TEST_PARSERS_PATH / "return-unmapped.toml")
.parse(TEST_SOURCES_PATH / "return-unmapped.csv")
.write_csv("subject")
)
assert transformed_csv_data == snapshot