Skip to content

Commit

Permalink
Feature control checks (#134)
Browse files Browse the repository at this point in the history
* Formatted and refactored

* Added control completeness

* Added control checks
  • Loading branch information
canimus authored Dec 29, 2023
1 parent e8b20a2 commit df128f5
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 34 deletions.
35 changes: 10 additions & 25 deletions cuallee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def validate_data_types(self, rules: List[Rule], dataframe: Any) -> bool:
def summary(self, check: Any, dataframe: Any) -> Any:
"""Computes all predicates and expressions for check summary"""


class Check:
def __init__(
self,
Expand Down Expand Up @@ -627,27 +627,12 @@ def validate(self, dataframe: Any):
), "Invalid data types between rules and dataframe"
return self.compute_engine.summary(self, dataframe)

# def samples(self, dataframe: Any, rule_index: int = None) -> Any:
# if not rule_index:
# return reduce(
# DataFrame.unionAll,
# [dataframe.filter(predicate) for predicate in self.predicates],
# ).drop_duplicates()
# elif isinstance(rule_index, int):
# return reduce(
# DataFrame.unionAll,
# [
# dataframe.filter(predicate)
# for index, predicate in enumerate(self.predicates, 1)
# if rule_index == index
# ],
# )
# elif isinstance(rule_index, list):
# return reduce(
# DataFrame.unionAll,
# [
# dataframe.filter(predicate)
# for index, predicate in enumerate(self.predicates, 1)
# if index in rule_index
# ],
# ).drop_duplicates()


class Control:
    """Ready-made controls that apply a single rule to every column."""

    @staticmethod
    def completeness(dataframe):
        """Control of null values on data frames.

        Builds a WARNING-level check named "Completeness", registers an
        ``is_complete`` rule for each entry in ``dataframe.columns``, and
        returns the result of ``Check.validate`` for that dataframe.
        """
        check = Check(CheckLevel.WARNING, "Completeness")
        # Rules are registered for their side effect on ``check``; a plain
        # loop avoids building a list that would be thrown away.
        for column in dataframe.columns:
            check.is_complete(column)
        return check.validate(dataframe)
2 changes: 1 addition & 1 deletion cuallee/polars_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ def _calculate_pass_rate(result, nrows):
if result.imag > nrows:
return nrows / result.imag
else:
return result.imag / nrows
return nrows / (nrows + result.imag)
else:
return 1.0

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cuallee"
version = "0.6.1"
version = "0.7.0"
authors = [
{ name="Herminio Vazquez", email="canimus@gmail.com"},
{ name="Virginie Grosboillot", email="vestalisvirginis@gmail.com" }
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[metadata]
name = cuallee
version = 0.6.1
version = 0.7.0
[options]
packages = find:
9 changes: 9 additions & 0 deletions test/unit/class_control/test_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from cuallee import Control
import pandas as pd

def test_has_completeness():
    """Control exposes an attribute named ``completeness``."""
    attribute_name = "completeness"
    assert hasattr(Control, attribute_name)

def test_completeness_result():
    """Every row of the completeness summary reports PASS for a null-free frame."""
    frame = pd.DataFrame({"A": list(range(1, 6))})
    summary = Control.completeness(frame)
    passed = summary.status.eq("PASS")
    assert passed.all()
12 changes: 6 additions & 6 deletions test/unit/polars_dataframe/test_is_daily.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,23 @@ def test_positive(check: Check):
df = pl.DataFrame({
"id": pl.date_range(start=date(2022,11,20), end=date(2022,11,26), interval="1d", eager=True)
})
result = check.validate(df).select(pl.col("status")) == "PASS"
assert all(result.to_series().to_list())

assert check.validate(df).select(pl.col("status").eq("PASS").all()).item()


def test_negative(check: Check):
check.is_daily("id")
df = pl.DataFrame(
{"id": [datetime.today() + timedelta(days=i) for i in range(1, 10, 2)]}
)
result = check.validate(df).select(pl.col("status")) == "FAIL"
assert all(result.to_series().to_list())
assert check.validate(df).select(pl.col("status").eq("FAIL").all()).item()


def test_coverage(check: Check):
check.is_daily("id", pct=0.6)
df = pl.DataFrame(
{"id": [datetime(2022, 12, 12) + timedelta(days=i) for i in range(1, 10, 2)]}
)
result = check.validate(df).select(pl.col("status")) == "PASS"
assert all(result.to_series().to_list())

assert check.validate(df).select(pl.col("status").eq("PASS").all()).item()

0 comments on commit df128f5

Please sign in to comment.