Skip to content

Commit

Permalink
internals rewrite: clean up checks and hypothesis functionality (#1109)
Browse files Browse the repository at this point in the history
* rename core.pandas.checks to *.builtin_checks

* handle strategies better

* wip re-implement builtin checks

* clean up hypotheses

* add docstrings

* move builtin checks/hypotheses modules

* clean up register_{check, hypothesis}

* minor import cleanup

* clean up check registration, error implementation

* create backends.base subpackage

* remove debugging script

* cleanup, fix codecov
  • Loading branch information
cosmicBboy authored Mar 13, 2023
1 parent 0ef450b commit d6c2078
Show file tree
Hide file tree
Showing 32 changed files with 1,311 additions and 819 deletions.
7 changes: 7 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ good-names=
fp,
bar,
_IS_INFERRED,
eq,
ne,
gt,
ge,
lt,
le,
dt

[MESSAGES CONTROL]
disable=
Expand Down
3 changes: 3 additions & 0 deletions pandera/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,13 @@
pandas_version,
)

import pandera.backends

from pandera.schema_inference.pandas import infer_schema
from pandera.decorators import check_input, check_io, check_output, check_types
from pandera.version import __version__


if platform.system() != "Windows":
# pylint: disable=ungrouped-imports
from pandera.dtypes import Complex256, Float128
Expand Down
7 changes: 7 additions & 0 deletions pandera/backends/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Pandera backends."""

# ensure that base builtin checks and hypotheses are registered
import pandera.backends.base.builtin_checks
import pandera.backends.base.builtin_hypotheses

import pandera.backends.pandas
12 changes: 9 additions & 3 deletions pandera/backends/base.py → pandera/backends/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Base functions for Parsing, Validation, and Error Reporting Backends.
"""Base classes for parsing, validation, and error Reporting Backends.
This class should implement a common interface of operations needed for
These classes implement a common interface of operations needed for
data validation. These operations are exposed as methods that are composed
together to implement the pandera schema specification.
"""

from abc import ABC
from typing import Optional
from typing import Any, Dict, List, Optional


class BaseSchemaBackend(ABC):
Expand Down Expand Up @@ -100,6 +100,12 @@ def check_dtype(self, check_obj, schema):
"""Core check that checks the data type of a check object."""
raise NotImplementedError

def failure_cases_metadata(
    self, schema_name: str, schema_errors: List[Dict[str, Any]]
):
    """Get failure cases metadata for lazy validation.

    Abstract hook: this base implementation does no work and always
    raises, so a concrete backend has to override it to be usable.

    :param schema_name: string identifying the schema (presumably the name
        of the schema that produced the errors -- confirm against callers).
    :param schema_errors: list of error records, each a dictionary.
        NOTE(review): the keys of these dictionaries are not visible here.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError


class BaseCheckBackend(ABC):
"""Abstract base class for a check backend implementation."""
Expand Down
98 changes: 98 additions & 0 deletions pandera/backends/base/builtin_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# pylint: disable=missing-function-docstring
"""Built-in check functions base implementation.
This module contains check function abstract definitions that correspond to
the pandera.core.checks.Check methods. These functions do not actually
implement any validation logic and serve as the entrypoint for dispatching
specific implementations based on the data object type, e.g.
`pandas.DataFrame`s.
"""

import re
from typing import Any, Iterable, TypeVar, Union

from pandera.core.checks import Check


T = TypeVar("T")


# Abstract stub for the ``equal_to`` built-in check. It holds no validation
# logic: the decorator registers it on ``Check`` and the body always raises.
# ``value`` is presumably the comparison target -- confirm against the
# type-specific implementations.
@Check.register_builtin_check_fn
def equal_to(data: Any, value: Any) -> Any:
    raise NotImplementedError


# Abstract stub for the ``not_equal_to`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``value`` is presumably the value to differ from -- confirm
# against the type-specific implementations.
@Check.register_builtin_check_fn
def not_equal_to(data: Any, value: Any) -> Any:
    raise NotImplementedError


# Abstract stub for the ``greater_than`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``min_value`` is presumably an exclusive lower bound --
# confirm against the type-specific implementations.
@Check.register_builtin_check_fn
def greater_than(data: Any, min_value: Any) -> Any:
    raise NotImplementedError


# Abstract stub for the ``greater_than_or_equal_to`` built-in check. It holds
# no validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``min_value`` is presumably an inclusive lower bound --
# confirm against the type-specific implementations.
@Check.register_builtin_check_fn
def greater_than_or_equal_to(data: Any, min_value: Any) -> Any:
    raise NotImplementedError


# Abstract stub for the ``less_than`` built-in check. It holds no validation
# logic: the decorator registers it on ``Check`` and the body always raises.
# ``max_value`` is presumably an exclusive upper bound -- confirm against the
# type-specific implementations.
@Check.register_builtin_check_fn
def less_than(data: Any, max_value: Any) -> Any:
    raise NotImplementedError


# Abstract stub for the ``less_than_or_equal_to`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``max_value`` is presumably an inclusive upper bound --
# confirm against the type-specific implementations.
@Check.register_builtin_check_fn
def less_than_or_equal_to(data: Any, max_value: Any) -> Any:
    raise NotImplementedError


@Check.register_builtin_check_fn
def in_range(
data: Any,
min_value: T,
max_value: T,
include_min: bool = True,
include_max: bool = True,
) -> Any:
raise NotImplementedError


# Abstract stub for the ``isin`` built-in check. It holds no validation
# logic: the decorator registers it on ``Check`` and the body always raises.
# ``allowed_values`` is any iterable; how it is consumed is decided by the
# type-specific implementations, not here.
@Check.register_builtin_check_fn
def isin(data: Any, allowed_values: Iterable) -> Any:
    raise NotImplementedError


# Abstract stub for the ``notin`` built-in check. It holds no validation
# logic: the decorator registers it on ``Check`` and the body always raises.
# ``forbidden_values`` is any iterable; how it is consumed is decided by the
# type-specific implementations, not here.
@Check.register_builtin_check_fn
def notin(data: Any, forbidden_values: Iterable) -> Any:
    raise NotImplementedError


# Abstract stub for the ``str_matches`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``pattern`` may be a plain string or a compiled
# ``re.Pattern``; the matching semantics live in the implementations.
@Check.register_builtin_check_fn
def str_matches(data: Any, pattern: Union[str, re.Pattern]) -> Any:
    raise NotImplementedError


# Abstract stub for the ``str_contains`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``pattern`` may be a plain string or a compiled
# ``re.Pattern``; the search semantics live in the implementations.
@Check.register_builtin_check_fn
def str_contains(data: Any, pattern: Union[str, re.Pattern]) -> Any:
    raise NotImplementedError


# Abstract stub for the ``str_startswith`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``string`` is presumably the expected prefix -- confirm
# against the type-specific implementations.
@Check.register_builtin_check_fn
def str_startswith(data: Any, string: str) -> Any:
    raise NotImplementedError


# Abstract stub for the ``str_endswith`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``string`` is presumably the expected suffix -- confirm
# against the type-specific implementations.
@Check.register_builtin_check_fn
def str_endswith(data: Any, string: str) -> Any:
    raise NotImplementedError


# Abstract stub for the ``str_length`` built-in check. It holds no validation
# logic: the decorator registers it on ``Check`` and the body always raises.
# Both bounds default to None.
# NOTE(review): ``int = None`` is an implicit Optional. It is left as-is
# because the registration decorator may inspect these annotations -- confirm
# before tightening them to ``Optional[int]``.
@Check.register_builtin_check_fn
def str_length(data: Any, min_value: int = None, max_value: int = None) -> Any:
    raise NotImplementedError


# Abstract stub for the ``unique_values_eq`` built-in check. It holds no
# validation logic: the decorator registers it on ``Check`` and the body
# always raises. ``values`` is any iterable (presumably the expected set of
# unique values -- confirm against the type-specific implementations).
@Check.register_builtin_check_fn
def unique_values_eq(data: Any, values: Iterable) -> Any:
    raise NotImplementedError
31 changes: 31 additions & 0 deletions pandera/backends/base/builtin_hypotheses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# pylint: disable=missing-function-docstring
"""Built-in hypothesis functions base implementation.
This module contains hypothesis function abstract definitions that
correspond to the pandera.core.hypotheses.Hypothesis methods. These functions do not
actually implement any validation logic and serve as the entrypoint for
dispatching specific implementations based on the data object type, e.g.
`pandas.DataFrame`s.
"""

from typing import Any, Tuple

from pandera.core.hypotheses import Hypothesis


# Abstract stub for the ``two_sample_ttest`` built-in hypothesis. It holds no
# validation logic: the decorator registers it on ``Hypothesis`` and the body
# always raises. Samples are passed positionally as varargs; ``equal_var``
# (default True) and ``nan_policy`` (default "propagate") are keyword-only
# because they follow ``*samples``.
@Hypothesis.register_builtin_check_fn
def two_sample_ttest(
    *samples: Tuple[Any, ...],
    equal_var: bool = True,
    nan_policy: str = "propagate",
):
    raise NotImplementedError


# Abstract stub for the ``one_sample_ttest`` built-in hypothesis. It holds no
# validation logic: the decorator registers it on ``Hypothesis`` and the body
# always raises. Samples are passed positionally as varargs; ``popmean`` is a
# required keyword-only argument (no default) and ``nan_policy`` defaults to
# "propagate".
@Hypothesis.register_builtin_check_fn
def one_sample_ttest(
    *samples: Tuple[Any, ...],
    popmean: float,
    nan_policy: str = "propagate",
):
    raise NotImplementedError
38 changes: 38 additions & 0 deletions pandera/backends/pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Pandas backend implementation for schemas and checks."""

import pandas as pd

import pandera.typing
from pandera.core.checks import Check
from pandera.core.hypotheses import Hypothesis

from pandera.backends.pandas.checks import PandasCheckBackend
from pandera.backends.pandas.hypotheses import PandasHypothesisBackend
from pandera.backends.pandas import builtin_checks, builtin_hypotheses


# Data container types that the pandas check and hypothesis backends are
# registered for. pandas itself is always included.
data_types = [pd.DataFrame, pd.Series]

# Each optional integration below contributes its DataFrame/Series types only
# when the matching ``*_INSTALLED`` flag is truthy. The import is kept inside
# the guard so the optional package is imported only in that case.
if pandera.typing.dask.DASK_INSTALLED:
    import dask.dataframe as dd

    data_types.extend([dd.DataFrame, dd.Series])

if pandera.typing.modin.MODIN_INSTALLED:
    import modin.pandas as mpd

    data_types.extend([mpd.DataFrame, mpd.Series])

if pandera.typing.pyspark.PYSPARK_INSTALLED:
    import pyspark.pandas as ps

    data_types.extend([ps.DataFrame, ps.Series])

if pandera.typing.geopandas.GEOPANDAS_INSTALLED:
    import geopandas as gpd

    data_types.extend([gpd.GeoDataFrame, gpd.GeoSeries])

# Register the same two backend classes for every collected data type, as an
# import-time side effect of this module.
for t in data_types:
    Check.register_backend(t, PandasCheckBackend)
    Hypothesis.register_backend(t, PandasHypothesisBackend)
4 changes: 3 additions & 1 deletion pandera/backends/pandas/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def validate(

if lazy and error_handler.collected_errors:
raise SchemaErrors(
schema, error_handler.collected_errors, check_obj
schema=schema,
schema_errors=error_handler.collected_errors,
data=check_obj,
)
return check_obj

Expand Down
22 changes: 21 additions & 1 deletion pandera/backends/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import warnings
from typing import (
Any,
Dict,
FrozenSet,
Iterable,
List,
Expand All @@ -17,10 +19,12 @@
from pandera.backends.pandas.error_formatters import (
format_generic_error_message,
format_vectorized_error_message,
consolidate_failure_cases,
summarize_failure_cases,
reshape_failure_cases,
scalar_failure_case,
)
from pandera.errors import SchemaError
from pandera.errors import SchemaError, FailureCaseMetadata


class ColumnInfo(NamedTuple):
Expand Down Expand Up @@ -118,3 +122,19 @@ def run_check(
check_output=check_result.check_output,
)
return check_result.check_passed

def failure_cases_metadata(
    self,
    schema_name: str,
    schema_errors: List[Dict[str, Any]],
) -> FailureCaseMetadata:
    """Assemble the failure-case metadata needed by the SchemaErrors exception.

    The collected errors are first consolidated into a single failure-cases
    object, which is then summarized into a message and error counts.

    :param schema_name: forwarded unchanged to ``summarize_failure_cases``.
    :param schema_errors: list of error dictionaries; passed to both
        ``consolidate_failure_cases`` and ``summarize_failure_cases``.
    :returns: a ``FailureCaseMetadata`` carrying the consolidated failure
        cases, the summary message and the error counts.
    """
    consolidated = consolidate_failure_cases(schema_errors)
    summary = summarize_failure_cases(schema_name, schema_errors, consolidated)
    # The summary is a two-item result: the message, then the error counts.
    summary_message, counts = summary
    metadata = FailureCaseMetadata(
        failure_cases=consolidated,
        message=summary_message,
        error_counts=counts,
    )
    return metadata
Loading

0 comments on commit d6c2078

Please sign in to comment.