Skip to content

Commit

Permalink
qc: add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
abhidg committed Oct 9, 2023
1 parent 806dc8e commit 3f02fe2
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 4 deletions.
9 changes: 7 additions & 2 deletions adtl/qc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ class WorkUnitResult(TypedDict):
rows_success: int
rows_fail: int
ratio_success: float
rows_fail_idx: List[int]
success: bool
mostly: float
series: List[Dict[str, Any]]
fail_data: pd.DataFrame


def rules_for(pattern: str, *rules):
Expand Down Expand Up @@ -68,6 +69,9 @@ def wrapper(df, **kwargs):
for c in set(columns) - set(df.columns):
df[c] = None
series = func(df, **kwargs)
assert len(series) == len(df), \
"Returned series must have same cardinality as source dataframe"
rows_fail_idx = [i for i, val in enumerate(series) if val is False]
if isinstance(series, (pd.Series, np.ndarray)):
rows_success: int = series.sum()
rows_fail = len(series) - rows_success
Expand All @@ -78,7 +82,8 @@ def wrapper(df, **kwargs):
ratio_success=ratio_success,
success=ratio_success >= mostly,
mostly=mostly,
series=series,
rows_fail_idx=rows_fail_idx,
fail_data=df.loc[rows_fail_idx][columns],
)
elif isinstance(series, bool):
return dict(
Expand Down
3 changes: 1 addition & 2 deletions adtl/qc/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
DEFAULT_PATTERN = "*.csv"



def collect_datasets(
root: Path = Path("."), file_formats: List[str] = ["csv"]
) -> List[Dataset]:
Expand Down Expand Up @@ -81,7 +80,7 @@ def process_work_unit(unit: WorkUnit) -> WorkUnitResult:
rule_function = getattr(module, rule["name"])

# TODO: assumes file is CSV, should be a generic reader
result = rule_function(pd.read_csv(unit["file"], dtype=str))
result = rule_function(pd.read_csv(unit["file"]))
result.update(
dict(rule=unit["rule"]["name"], dataset=unit["dataset"], file=unit["file"])
)
Expand Down
11 changes: 11 additions & 0 deletions tests/data/dataset/test-subject.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
sex_at_birth,pregnancy
male,
female,False
male,False
male,
male,
female,True
female,
male,True
male,
female,True
18 changes: 18 additions & 0 deletions tests/qc/pregnancy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"Pregnancy related checks"

from adtl.qc import rule, rules_for
import pandas as pd


@rule(columns=["sex", "sex_at_birth", "pregnancy"])
def rule_male_patients_not_pregnant(df: pd.DataFrame) -> pd.Series:
    "Male patients are not pregnant"
    # NOTE: the one-line docstring above is left untouched on purpose -- it
    # appears to be surfaced verbatim as the rule's description.
    #
    # Returns a boolean Series aligned with ``df``: False for rows that are
    # recorded as male (by ``sex`` or ``sex_at_birth``), not recorded as
    # female in the other column, and flagged as pregnant; True otherwise.
    is_female = (df.sex == "female") | (df.sex_at_birth == "female")
    is_male = (df.sex == "male") | (df.sex_at_birth == "male")

    # Explicit comparison with True: missing values (NaN/None) and False are
    # both treated as "not pregnant".
    is_pregnant = df.pregnancy.eq(True)

    # The earlier form, ``(...male...) & df.pregnancy != True``, was parsed as
    # ``((...male...) & df.pregnancy) != True`` because ``&`` binds tighter
    # than ``!=``.  The grouping is spelled out here so the result no longer
    # depends on a logical AND between a boolean mask and a raw data column.
    # Rows with no male record (including rows where both sex columns are
    # missing) pass, exactly as before.
    return is_female | ~(is_male & is_pregnant)


# Associate the rule with the "*-subject.csv" file pattern (presumably matched
# against data file names -- rules_for's implementation is not shown here).
rules_for("*-subject.csv", rule_male_patients_not_pregnant)
81 changes: 81 additions & 0 deletions tests/test_qc_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Tests for QC runner
"""

import os
import pandas as pd
from pathlib import Path

import pytest
from adtl.qc.runner import (
collect_datasets,
collect_rules,
collect_work_units,
process_work_unit,
)


def test_collect_datasets():
    """The first collected dataset is ``dataset`` and lists the subject CSV."""
    data_root = Path(__file__).parent / "data"
    collected = collect_datasets(data_root)
    first = collected[0]

    assert first["dataset"] == "dataset"

    first_file = str(first["files"][0])
    assert first_file.endswith("data/dataset/test-subject.csv")


def test_collect_rules():
    """The pregnancy rule is discovered under ``qc`` with its metadata."""
    # collect_rules is called with a relative path, so run it from the tests
    # directory.  try/finally guarantees the original working directory is
    # restored even if collection raises, so a failure here cannot leak a
    # changed cwd into other tests.
    cwd = os.getcwd()
    os.chdir(Path(__file__).parent)
    try:
        rule = collect_rules(Path("qc"))[0]
    finally:
        os.chdir(cwd)

    assert rule == {
        "description": "Male patients are not pregnant",
        "module": "qc.pregnancy",
        "name": "rule_male_patients_not_pregnant",
        "pattern": "*-subject.csv",
    }


@pytest.fixture
def work_unit():
    """Return the first work unit built from the test datasets and rules."""
    datasets = collect_datasets(Path(__file__).parent / "data")

    # Rules are collected via a relative path, hence the temporary chdir.
    # The finally clause restores the working directory even when rule
    # collection or work-unit construction raises.
    cwd = os.getcwd()
    os.chdir(Path(__file__).parent)
    try:
        rules = collect_rules(Path("qc"))
        return collect_work_units(datasets, rules)[0]
    finally:
        os.chdir(cwd)


def test_collect_work_units(work_unit):
    """A work unit pairs the subject CSV with the pregnancy rule."""
    subject_csv = "data/dataset/test-subject.csv"
    dataset = work_unit["dataset"]

    assert dataset["dataset"] == "dataset"
    assert str(dataset["files"][0]).endswith(subject_csv)
    assert str(work_unit["file"]).endswith(subject_csv)

    expected_rule = dict(
        description="Male patients are not pregnant",
        module="qc.pregnancy",
        name="rule_male_patients_not_pregnant",
        pattern="*-subject.csv",
    )
    assert work_unit["rule"] == expected_rule


def test_process_work_unit(work_unit):
    """Processing the work unit reports the single pregnant-male row."""
    result = process_work_unit(work_unit)

    # Check the metadata carried on the *result* (the function under test),
    # not on the input work unit, which test_collect_work_units already covers.
    assert result["dataset"]["dataset"] == "dataset"
    assert str(result["dataset"]["files"][0]).endswith(
        "data/dataset/test-subject.csv"
    )
    assert str(result["file"]).endswith("data/dataset/test-subject.csv")

    # Subset check: every expected key/value pair must appear in the result;
    # the result may carry additional keys.
    assert (
        dict(
            mostly=0,
            ratio_success=0.9,
            rows_fail=1,
            rows_success=9,
            rule="rule_male_patients_not_pregnant",
            rows_fail_idx=[7],
        ).items()
        <= result.items()
    )

    # The failing row is the only "male" record with pregnancy set to True;
    # the CSV has no "sex" column, so that value is expected to be None.
    df = pd.DataFrame({"sex_at_birth": ["male"], "sex": [None], "pregnancy": [True]})
    assert df.to_dict(orient="records") == result["fail_data"].to_dict(orient="records")

0 comments on commit 3f02fe2

Please sign in to comment.