Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tolerance to row count match #73

Merged
merged 8 commits into from
Mar 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ test-update:
pytest --snapshot-update

test-coverage:
pytest --cov=pointblank --cov-report=term
pytest --cov=pointblank --cov-report=term-missing
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't know about this! But it's a great improvement.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I accidentally left that in the pr 😂


check:
pyright --pythonversion 3.8 pointblank
Expand Down
17 changes: 13 additions & 4 deletions pointblank/_interrogation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass

from typing import Any
from typing import Any, TYPE_CHECKING

import narwhals as nw
from narwhals.typing import FrameT
Expand All @@ -18,6 +18,8 @@
from pointblank.column import Column, ColumnLiteral
from pointblank.schema import Schema

if TYPE_CHECKING:
from pointblank._typing import AbsoluteTolBounds

@dataclass
class Interrogator:
Expand Down Expand Up @@ -1893,16 +1895,23 @@ class RowCountMatch:
count: int
inverse: bool
threshold: int
abs_tol_bounds : AbsoluteTolBounds
tbl_type: str = "local"

def __post_init__(self):

from pointblank.validate import get_row_count

if not self.inverse:
res = get_row_count(data=self.data_tbl) == self.count
row_count :int = get_row_count(data=self.data_tbl)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it common/better to declare types for variables? I'm honestly not sure what the norm is on this one. I only ask because if it is a preferred approach, we should probably be doing this everywhere (which is what I'm leaning toward).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"best practice" says to type hint everything but it's not necessary. Implementing type checking on a codebase of this size is a giant task, worth it in the long run but not trivial. I'd be happy to do so if you're on board though.

I will say because of how dynamic this code base is, it would probably be even hard than usual, but not at all impossible.

The most bang-for-buck would be to type check function signatures, it's 10% of the work for 90% gain. In my experience, so take it with a grain of salt, most things are caught at the signature level and not the individual variable level.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for weighing in on this. I think we ought to add types to variable declarations! We can both chip away at this, and I think it'll be worth it.


lower_abs_limit, upper_abs_limit = self.abs_tol_bounds
min_val: int = self.count - lower_abs_limit
max_val: int = self.count + upper_abs_limit

if self.inverse:
res : bool = not (row_count >= min_val and row_count <= max_val)
else:
res = get_row_count(data=self.data_tbl) != self.count
res : bool = row_count >= min_val and row_count <= max_val

self.test_unit_res = res

Expand Down
11 changes: 11 additions & 0 deletions pointblank/_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

from __future__ import annotations

## Absolute bounds, ie. plus or minus
type AbsoluteBounds = tuple[int, int]

## Relative bounds, ie. plus or minus some percent
type RelativeBounds = tuple[float, float]

## Tolerance afforded to some check
type Tolerance = int | float | AbsoluteBounds | RelativeBounds
21 changes: 20 additions & 1 deletion pointblank/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import inspect

from typing import Any
from typing import Any, TYPE_CHECKING

import narwhals as nw
from narwhals.typing import FrameT
Expand All @@ -12,6 +12,25 @@

from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES

if TYPE_CHECKING:
from pointblank._typing import AbsoluteBounds, Tolerance

def _derive_single_bound(ref: int, tol : int | float) -> int:
"""Derive a single bound using the reference."""
if not isinstance(tol, float | int):
raise TypeError("Tolerance must be a number or a tuple of numbers.")
if tol < 0:
raise ValueError("Tolerance must be non-negative.")
return int(tol * ref) if tol < 1 else int(tol)

def _derive_bounds(ref: int, tol: Tolerance) -> AbsoluteBounds:
"""Validate and extract the absolute bounds of the tolerance."""
if isinstance(tol, tuple):
return tuple(_derive_single_bound(ref, t) for t in tol)

bound = _derive_single_bound(ref, tol)
return bound, bound


def _get_tbl_type(data: FrameT | Any) -> str:

Expand Down
52 changes: 51 additions & 1 deletion pointblank/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
_is_lib_present,
_is_value_a_df,
_select_df_lib,
_derive_bounds
)
from pointblank._utils_check_args import (
_check_column,
Expand All @@ -75,6 +76,11 @@
)
from pointblank._utils_html import _create_table_type_html, _create_table_dims_html

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from pointblank._typing import Tolerance, AbsoluteBounds

__all__ = [
"Validate",
"load_dataset",
Expand Down Expand Up @@ -4551,6 +4557,7 @@ def col_schema_match(
def row_count_match(
self,
count: int | FrameT | Any,
tol: Tolerance = 0,
inverse: bool = False,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
Expand All @@ -4575,6 +4582,14 @@ def row_count_match(
The expected row count of the table. This can be an integer value, a Polars or Pandas
DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row
count of that object will be used as the expected count.
tol
The tolerance allowable for the row count match. This can be specified as a single
numeric value (integer or float) or as a tuple of two integers representing the lower
and upper bounds of the tolerance range. If a single integer value (greater than 1) is
provided, it represents the absolute bounds of the tolerance, ie. plus or minus the value.
If a float value (between 0-1) is provided, it represents the relative tolerance, ie.
plus or minus the relative percentage of the target. If a tuple is provided, it represents
the lower and upper absolute bounds of the tolerance range. See the examples for more.
inverse
Should the validation step be inverted? If `True`, then the expectation is that the row
count of the target table should not match the specified `count=` value.
Expand Down Expand Up @@ -4640,6 +4655,37 @@ def row_count_match(

The validation table shows that the expectation value of `13` matches the actual count of
rows in the target table. So, the single test unit passed.


Let's modify our example to show the different ways we can allow some tolerance to our validation
by using the `tol` argument.

```{python}
smaller_small_table = small_table.sample(n = 12) # within the lower bound
validation = (
pb.Validate(data=smaller_small_table)
.row_count_match(count=13,tol=(2, 0)) # minus 2 but plus 0, ie. 11-13
.interrogate()
)

validation

validation = (
pb.Validate(data=smaller_small_table)
.row_count_match(count=13,tol=.05) # .05% tolerance of 13
.interrogate()
)

even_smaller_table = small_table.sample(n = 2)
validation = (
pb.Validate(data=even_smaller_table)
.row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
.interrogate()
)

validation
```

"""

assertion_type = _get_fn_name()
Expand All @@ -4659,8 +4705,11 @@ def row_count_match(
if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)):
count = get_row_count(count)

# Check the integrity of tolerance
bounds: AbsoluteBounds = _derive_bounds(ref = int(count), tol = tol)

# Package up the `count=` and boolean params into a dictionary for later interrogation
values = {"count": count, "inverse": inverse}
values = {"count": count, "inverse": inverse, "abs_tol_bounds": bounds}

val_info = _ValidationInfo(
assertion_type=assertion_type,
Expand Down Expand Up @@ -5158,6 +5207,7 @@ def interrogate(
count=value["count"],
inverse=value["inverse"],
threshold=threshold,
abs_tol_bounds = value["abs_tol_bounds"],
tbl_type=tbl_type,
).get_test_results()

Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,8 @@ testpaths = [

[tool.black]
line-length = 100

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this coverage exclusion. Hope it's alright!

[tool.coverage.report]
exclude_also = [
"if TYPE_CHECKING:"
]
73 changes: 73 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@

from __future__ import annotations

import pathlib

import pprint
import sys
import re
from unittest.mock import patch
import pytest
import random
import itertools
from functools import partial
import contextlib
Expand Down Expand Up @@ -47,6 +51,11 @@
last_n,
)

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any


TBL_LIST = [
"tbl_pd",
Expand Down Expand Up @@ -2660,6 +2669,70 @@ def test_row_count_match(request, tbl_fixture):

assert Validate(tbl).row_count_match(count=tbl).interrogate().n_passed(i=1, scalar=True) == 1

@pytest.mark.parametrize(('val','e', 'exc'),
[
((-1, 5), ValueError, 'Tolerance must be non-negative'),
([100, 5], TypeError, 'Tolerance must be a number or a tuple of numbers'),
((5, -1), ValueError, 'Tolerance must be non-negative'),
((None, .05), TypeError, 'Tolerance must be a number or a tuple of numbers'),
(('fooval', 100), TypeError, 'Tolerance must be a number or a tuple of numbers'),
(-1, ValueError, 'Tolerance must be non-negative'),
])
def test_invalid_row_count_tol(val: Any, e: Exception, exc :str) -> None:
data = pl.DataFrame({"foocol": [1, 2, 3]})

with pytest.raises(expected_exception=e, match = exc):
Validate(data=data).row_count_match(count = 3, tol=val)


def test_row_count_example_tol() -> None:
small_table = load_dataset("small_table")
smaller_small_table = small_table.sample(n = 12) # within the lower bound
(
Validate(data=smaller_small_table)
.row_count_match(count=13,tol=(2, 0)) # minus 2 but plus 0, ie. 11-13
.interrogate()
.assert_passing()
)

(
Validate(data=smaller_small_table)
.row_count_match(count=13,tol=.5) # .50% tolerance of 13
.interrogate()
.assert_passing()
)

even_smaller_table = small_table.sample(n = 2)
with pytest.raises(AssertionError):
(
Validate(data=even_smaller_table)
.row_count_match(count=13,tol=5) # plus or minus 5; this test will fail
.interrogate()
.assert_passing()
)

test_row_count_example_tol()

@pytest.mark.parametrize(('nrows','target_count','tol','should_pass'),
[
(98, 100, .05, True),
(98, 100, 5, True),
(104, 100, (5, 5), True),
(0, 100, .05, False),
(0, 100, 5, False),
(0, 100, (5, 5), False),
(98, 100, .95, True),
])
def test_row_count_tol(nrows : int, target_count: int, tol : float | tuple[int, int], should_pass : bool) -> None:
data = pl.DataFrame({"foocol": [random.random()] * nrows})

catcher = contextlib.nullcontext if should_pass else partial(pytest.raises, AssertionError, match = "All tests did not pass")

with catcher():
Validate(data = data).row_count_match(
count = target_count,
tol = tol
).interrogate().assert_passing()

@pytest.mark.parametrize("tbl_fixture", TBL_LIST)
def test_col_count_match(request, tbl_fixture):
Expand Down