From 043c1c2e5188b5266090042a4048367594434c24 Mon Sep 17 00:00:00 2001 From: Niels Bantilan Date: Wed, 19 Oct 2022 09:42:54 -0400 Subject: [PATCH 1/8] move jupyterlite_sphinx to pip deps Signed-off-by: Niels Bantilan --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 790bf826b..03797698f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -54,4 +54,4 @@ types-pyyaml types-pkg_resources types-requests types-pytz -jupyterlite_sphinx \ No newline at end of file +jupyterlite_sphinx From 7e4c8f4d7ee46ec3552fbd67afa5c732f1aed39b Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Mon, 25 Jul 2022 12:26:30 +0200 Subject: [PATCH 2/8] Add compatibility with cudf Missing: unit tests --- pandera/typing/cudf.py | 350 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 pandera/typing/cudf.py diff --git a/pandera/typing/cudf.py b/pandera/typing/cudf.py new file mode 100644 index 000000000..426211393 --- /dev/null +++ b/pandera/typing/cudf.py @@ -0,0 +1,350 @@ +"""Typing definitions and helpers.""" +# pylint:disable=abstract-method,disable=too-many-ancestors +import io +from typing import _type_check # type: ignore[attr-defined] +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +<<<<<<< HEAD +======= +import cudf +>>>>>>> Add compatibility with cudf +import pandas as pd + +from ..errors import SchemaError, SchemaInitError +from .common import DataFrameBase, GenericDtype, IndexBase, Schema, SeriesBase +from .formats import Formats + +try: + from typing import _GenericAlias # type: ignore[attr-defined] +except ImportError: # pragma: no cover + _GenericAlias = None + +<<<<<<< HEAD +try: + import cudf + + try: + from pydantic.fields import ModelField + except ImportError: + ModelField = Any # type: ignore + + + # pylint:disable=too-few-public-methods + class Index(IndexBase, cudf.Index, Generic[GenericDtype]): + """Representation of pandas.Index, only used for type annotation. + + *new in 0.5.0* + """ + + + # pylint:disable=too-few-public-methods + class Series(SeriesBase, cudf.Series, Generic[GenericDtype]): # type: ignore + """Representation of pandas.Series, only used for type annotation. + + *new in 0.5.0* + """ + + if hasattr(pd.Series, "__class_getitem__") and _GenericAlias: + def __class_getitem__(cls, item): + """Define this to override the patch that pyspark.pandas performs on pandas. + https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 + """ + _type_check(item, "Parameters to generic types must be types.") + return _GenericAlias(cls, item) + + + # pylint:disable=invalid-name + if TYPE_CHECKING: + T = TypeVar("T") # pragma: no cover + else: + T = Schema + + + # pylint:disable=too-few-public-methods + class DataFrame(DataFrameBase, cudf.DataFrame, Generic[T]): + """ + A generic type for pandas.DataFrame. + + *new in 0.5.0* + """ + + if hasattr(pd.DataFrame, "__class_getitem__") and _GenericAlias: + def __class_getitem__(cls, item): + """Define this to override the patch that pyspark.pandas performs on pandas. 
+ https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 + """ + _type_check(item, "Parameters to generic types must be types.") + return _GenericAlias(cls, item) + + @classmethod + def __get_validators__(cls): + yield cls.pydantic_validate + + @classmethod + def from_format(cls, obj: Any, config) -> pd.DataFrame: + """ + Converts serialized data from a specific format + specified in the :py:class:`pandera.model.SchemaModel` config options + ``from_format`` and ``from_format_kwargs``. + + :param obj: object representing a serialized dataframe. + :param config: schema model configuration object. + """ + if config.from_format is None: + if not isinstance(obj, pd.DataFrame): + try: + obj = pd.DataFrame(obj) + except Exception as exc: + raise ValueError( + f"Expected pd.DataFrame, found {type(obj)}" + ) from exc + return obj + + reader = { + Formats.dict: pd.DataFrame, + Formats.csv: pd.read_csv, + Formats.json: pd.read_json, + Formats.feather: pd.read_feather, + Formats.parquet: pd.read_parquet, + Formats.pickle: pd.read_pickle, + }[Formats(config.from_format)] + + return reader(obj, **(config.from_format_kwargs or {})) + + @classmethod + def to_format(cls, data: pd.DataFrame, config) -> Any: + """ + Converts a dataframe to the format specified in the + :py:class:`pandera.model.SchemaModel` config options ``to_format`` + and ``to_format_kwargs``. + + :param data: convert this data to the specified format + :param config: :py:cl + """ + if config.to_format is None: + return data + + writer, buffer = { + Formats.dict: (data.to_dict, None), + Formats.csv: (data.to_csv, None), + Formats.json: (data.to_json, None), + Formats.feather: (data.to_feather, io.BytesIO()), + Formats.parquet: (data.to_parquet, io.BytesIO()), + Formats.pickle: (data.to_pickle, io.BytesIO()), + }[Formats(config.to_format)] + + args = [] if buffer is None else [buffer] + out = writer(*args, **(config.to_format_kwargs or {})) + if buffer is None: + return out + elif buffer.closed: + raise IOError( + f"pandas=={pd.__version__} closed the buffer automatically " + f"using the serialization method {writer}. Use a later " + "version of pandas or use a different the serialization " + "format." + ) + buffer.seek(0) + return buffer + + @classmethod + def _get_schema(cls, field: ModelField): + if not field.sub_fields: + raise TypeError( + "Expected a typed pandera.typing.DataFrame," + " e.g. DataFrame[Schema]" + ) + schema_model = field.sub_fields[0].type_ + try: + schema = schema_model.to_schema() + except SchemaInitError as exc: + raise ValueError( + f"Cannot use {cls.__name__} as a pydantic type as its " + "SchemaModel cannot be converted to a DataFrameSchema.\n" + f"Please revisit the model to address the following errors:" + f"\n{exc}" + ) from exc + return schema_model, schema + + @classmethod + def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: + """ + Verify that the input can be converted into a pandas dataframe that + meets all schema requirements. 
+ """ + schema_model, schema = cls._get_schema(field) + data = cls.from_format(obj, schema_model.__config__) + + try: + valid_data = schema.validate(data) + except SchemaError as exc: + raise ValueError(str(exc)) from exc + + return cls.to_format(valid_data, schema_model.__config__) + +except ImportError: + pass # Ignore + +======= + +try: + from pydantic.fields import ModelField +except ImportError: + ModelField = Any # type: ignore + + +# pylint:disable=too-few-public-methods +class Index(IndexBase, cudf.Index, Generic[GenericDtype]): + """Representation of pandas.Index, only used for type annotation. + + *new in 0.5.0* + """ + + +# pylint:disable=too-few-public-methods +class Series(SeriesBase, cudf.Series, Generic[GenericDtype]): # type: ignore + """Representation of pandas.Series, only used for type annotation. + + *new in 0.5.0* + """ + + if hasattr(pd.Series, "__class_getitem__") and _GenericAlias: + + def __class_getitem__(cls, item): + """Define this to override the patch that pyspark.pandas performs on pandas. + https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 + """ + _type_check(item, "Parameters to generic types must be types.") + return _GenericAlias(cls, item) + + +# pylint:disable=invalid-name +if TYPE_CHECKING: + T = TypeVar("T") # pragma: no cover +else: + T = Schema + + +# pylint:disable=too-few-public-methods +class DataFrame(DataFrameBase, cudf.DataFrame, Generic[T]): + """ + A generic type for pandas.DataFrame. + + *new in 0.5.0* + """ + + if hasattr(pd.DataFrame, "__class_getitem__") and _GenericAlias: + + def __class_getitem__(cls, item): + """Define this to override the patch that pyspark.pandas performs on pandas. + https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 + """ + _type_check(item, "Parameters to generic types must be types.") + return _GenericAlias(cls, item) + + @classmethod + def __get_validators__(cls): + yield cls.pydantic_validate + + @classmethod + def from_format(cls, obj: Any, config) -> pd.DataFrame: + """ + Converts serialized data from a specific format + specified in the :py:class:`pandera.model.SchemaModel` config options + ``from_format`` and ``from_format_kwargs``. + + :param obj: object representing a serialized dataframe. + :param config: schema model configuration object. + """ + if config.from_format is None: + if not isinstance(obj, pd.DataFrame): + try: + obj = pd.DataFrame(obj) + except Exception as exc: + raise ValueError( + f"Expected pd.DataFrame, found {type(obj)}" + ) from exc + return obj + + reader = { + Formats.dict: pd.DataFrame, + Formats.csv: pd.read_csv, + Formats.json: pd.read_json, + Formats.feather: pd.read_feather, + Formats.parquet: pd.read_parquet, + Formats.pickle: pd.read_pickle, + }[Formats(config.from_format)] + + return reader(obj, **(config.from_format_kwargs or {})) + + @classmethod + def to_format(cls, data: pd.DataFrame, config) -> Any: + """ + Converts a dataframe to the format specified in the + :py:class:`pandera.model.SchemaModel` config options ``to_format`` + and ``to_format_kwargs``. 
+ + :param data: convert this data to the specified format + :param config: :py:cl + """ + if config.to_format is None: + return data + + writer, buffer = { + Formats.dict: (data.to_dict, None), + Formats.csv: (data.to_csv, None), + Formats.json: (data.to_json, None), + Formats.feather: (data.to_feather, io.BytesIO()), + Formats.parquet: (data.to_parquet, io.BytesIO()), + Formats.pickle: (data.to_pickle, io.BytesIO()), + }[Formats(config.to_format)] + + args = [] if buffer is None else [buffer] + out = writer(*args, **(config.to_format_kwargs or {})) + if buffer is None: + return out + elif buffer.closed: + raise IOError( + f"pandas=={pd.__version__} closed the buffer automatically " + f"using the serialization method {writer}. Use a later " + "version of pandas or use a different the serialization " + "format." + ) + buffer.seek(0) + return buffer + + @classmethod + def _get_schema(cls, field: ModelField): + if not field.sub_fields: + raise TypeError( + "Expected a typed pandera.typing.DataFrame," + " e.g. DataFrame[Schema]" + ) + schema_model = field.sub_fields[0].type_ + try: + schema = schema_model.to_schema() + except SchemaInitError as exc: + raise ValueError( + f"Cannot use {cls.__name__} as a pydantic type as its " + "SchemaModel cannot be converted to a DataFrameSchema.\n" + f"Please revisit the model to address the following errors:" + f"\n{exc}" + ) from exc + return schema_model, schema + + @classmethod + def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: + """ + Verify that the input can be converted into a pandas dataframe that + meets all schema requirements. + """ + schema_model, schema = cls._get_schema(field) + data = cls.from_format(obj, schema_model.__config__) + + try: + valid_data = schema.validate(data) + except SchemaError as exc: + raise ValueError(str(exc)) from exc + + return cls.to_format(valid_data, schema_model.__config__) +>>>>>>> Add compatibility with cudf From 2acdfcf0797c1938dad1ece18d07b4c49747b007 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Fri, 29 Jul 2022 11:10:09 +0200 Subject: [PATCH 3/8] Accept the absence of cudf --- pandera/typing/cudf.py | 168 ----------------------------------------- 1 file changed, 168 deletions(-) diff --git a/pandera/typing/cudf.py b/pandera/typing/cudf.py index 426211393..ad260a3c6 100644 --- a/pandera/typing/cudf.py +++ b/pandera/typing/cudf.py @@ -4,10 +4,6 @@ from typing import _type_check # type: ignore[attr-defined] from typing import TYPE_CHECKING, Any, Generic, TypeVar -<<<<<<< HEAD -======= -import cudf ->>>>>>> Add compatibility with cudf import pandas as pd from ..errors import SchemaError, SchemaInitError @@ -19,7 +15,6 @@ except ImportError: # pragma: no cover _GenericAlias = None -<<<<<<< HEAD try: import cudf @@ -185,166 +180,3 @@ def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: except ImportError: pass # Ignore -======= - -try: - from pydantic.fields import ModelField -except ImportError: - ModelField = Any # type: ignore - - -# pylint:disable=too-few-public-methods -class Index(IndexBase, cudf.Index, Generic[GenericDtype]): - """Representation of pandas.Index, only used for type annotation. - - *new in 0.5.0* - """ - - -# pylint:disable=too-few-public-methods -class Series(SeriesBase, cudf.Series, Generic[GenericDtype]): # type: ignore - """Representation of pandas.Series, only used for type annotation. 
- - *new in 0.5.0* - """ - - if hasattr(pd.Series, "__class_getitem__") and _GenericAlias: - - def __class_getitem__(cls, item): - """Define this to override the patch that pyspark.pandas performs on pandas. - https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 - """ - _type_check(item, "Parameters to generic types must be types.") - return _GenericAlias(cls, item) - - -# pylint:disable=invalid-name -if TYPE_CHECKING: - T = TypeVar("T") # pragma: no cover -else: - T = Schema - - -# pylint:disable=too-few-public-methods -class DataFrame(DataFrameBase, cudf.DataFrame, Generic[T]): - """ - A generic type for pandas.DataFrame. - - *new in 0.5.0* - """ - - if hasattr(pd.DataFrame, "__class_getitem__") and _GenericAlias: - - def __class_getitem__(cls, item): - """Define this to override the patch that pyspark.pandas performs on pandas. - https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 - """ - _type_check(item, "Parameters to generic types must be types.") - return _GenericAlias(cls, item) - - @classmethod - def __get_validators__(cls): - yield cls.pydantic_validate - - @classmethod - def from_format(cls, obj: Any, config) -> pd.DataFrame: - """ - Converts serialized data from a specific format - specified in the :py:class:`pandera.model.SchemaModel` config options - ``from_format`` and ``from_format_kwargs``. - - :param obj: object representing a serialized dataframe. - :param config: schema model configuration object. - """ - if config.from_format is None: - if not isinstance(obj, pd.DataFrame): - try: - obj = pd.DataFrame(obj) - except Exception as exc: - raise ValueError( - f"Expected pd.DataFrame, found {type(obj)}" - ) from exc - return obj - - reader = { - Formats.dict: pd.DataFrame, - Formats.csv: pd.read_csv, - Formats.json: pd.read_json, - Formats.feather: pd.read_feather, - Formats.parquet: pd.read_parquet, - Formats.pickle: pd.read_pickle, - }[Formats(config.from_format)] - - return reader(obj, **(config.from_format_kwargs or {})) - - @classmethod - def to_format(cls, data: pd.DataFrame, config) -> Any: - """ - Converts a dataframe to the format specified in the - :py:class:`pandera.model.SchemaModel` config options ``to_format`` - and ``to_format_kwargs``. - - :param data: convert this data to the specified format - :param config: :py:cl - """ - if config.to_format is None: - return data - - writer, buffer = { - Formats.dict: (data.to_dict, None), - Formats.csv: (data.to_csv, None), - Formats.json: (data.to_json, None), - Formats.feather: (data.to_feather, io.BytesIO()), - Formats.parquet: (data.to_parquet, io.BytesIO()), - Formats.pickle: (data.to_pickle, io.BytesIO()), - }[Formats(config.to_format)] - - args = [] if buffer is None else [buffer] - out = writer(*args, **(config.to_format_kwargs or {})) - if buffer is None: - return out - elif buffer.closed: - raise IOError( - f"pandas=={pd.__version__} closed the buffer automatically " - f"using the serialization method {writer}. Use a later " - "version of pandas or use a different the serialization " - "format." - ) - buffer.seek(0) - return buffer - - @classmethod - def _get_schema(cls, field: ModelField): - if not field.sub_fields: - raise TypeError( - "Expected a typed pandera.typing.DataFrame," - " e.g. 
DataFrame[Schema]" - ) - schema_model = field.sub_fields[0].type_ - try: - schema = schema_model.to_schema() - except SchemaInitError as exc: - raise ValueError( - f"Cannot use {cls.__name__} as a pydantic type as its " - "SchemaModel cannot be converted to a DataFrameSchema.\n" - f"Please revisit the model to address the following errors:" - f"\n{exc}" - ) from exc - return schema_model, schema - - @classmethod - def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: - """ - Verify that the input can be converted into a pandas dataframe that - meets all schema requirements. - """ - schema_model, schema = cls._get_schema(field) - data = cls.from_format(obj, schema_model.__config__) - - try: - valid_data = schema.validate(data) - except SchemaError as exc: - raise ValueError(str(exc)) from exc - - return cls.to_format(valid_data, schema_model.__config__) ->>>>>>> Add compatibility with cudf From 5886fac07951aa21d446210bf73bc7cde1775d09 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Wed, 19 Oct 2022 17:42:44 +0200 Subject: [PATCH 4/8] Add cudf unit test --- pandera/typing/__init__.py | 5 +++++ pandera/typing/cudf.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandera/typing/__init__.py b/pandera/typing/__init__.py index d28d50522..63150e4c3 100644 --- a/pandera/typing/__init__.py +++ b/pandera/typing/__init__.py @@ -57,6 +57,11 @@ SERIES_TYPES.update({modin.Series}) INDEX_TYPES.update({modin.Index}) +if cudf.CUDF_INSTALLED: + DATAFRAME_TYPES.update({cudf.DataFrame}) + SERIES_TYPES.update({cudf.Series}) + INDEX_TYPES.update({cudf.Index}) + if pyspark.PYSPARK_INSTALLED: DATAFRAME_TYPES.update({pyspark.DataFrame}) SERIES_TYPES.update({pyspark.Series}) diff --git a/pandera/typing/cudf.py b/pandera/typing/cudf.py index ad260a3c6..2d93762e3 100644 --- a/pandera/typing/cudf.py +++ b/pandera/typing/cudf.py @@ -176,7 +176,7 @@ def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: raise ValueError(str(exc)) from exc return cls.to_format(valid_data, schema_model.__config__) - + CUDF_INSTALLED=True except ImportError: - pass # Ignore + CUDF_INSTALLED=False From ae2853b89792347de7672c8efecc3bdfedc25434 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Thu, 20 Oct 2022 09:40:00 +0200 Subject: [PATCH 5/8] Add cudf unit test and rebase from fix-dependencies --- environment.yml | 3 + pandera/core/pandas/types.py | 8 + pandera/cudf_accessor.py | 98 ++++++ pandera/errors.py | 16 + pandera/typing/__init__.py | 2 +- pandera/typing/cudf.py | 7 +- tests/cudf/conftest.py | 9 + tests/cudf/test_cudf_accessor.py | 21 ++ tests/cudf/test_schemas_on_cudf.py | 458 +++++++++++++++++++++++++++++ 9 files changed, 618 insertions(+), 4 deletions(-) create mode 100644 pandera/cudf_accessor.py create mode 100644 tests/cudf/conftest.py create mode 100644 tests/cudf/test_cudf_accessor.py create mode 100644 tests/cudf/test_schemas_on_cudf.py diff --git a/environment.yml b/environment.yml index 304e99ce5..74e624185 100644 --- a/environment.yml +++ b/environment.yml @@ -31,6 +31,9 @@ dependencies: - modin - protobuf <= 3.20.3 + # cudf extra + - cudf + # dask extra - dask - distributed diff --git a/pandera/core/pandas/types.py b/pandera/core/pandas/types.py index 661ba20e0..31ee7acbe 100644 --- a/pandera/core/pandas/types.py +++ b/pandera/core/pandas/types.py @@ -74,6 +74,14 @@ def supported_types() -> SupportedTypes: index_types.append(dd.Index) except ImportError: pass + try: + import cudf + + table_types.append(cudf.DataFrame) + field_types.append(cudf.Series) 
+        index_types.append(cudf.Index)
+    except ImportError:
+        pass
 
     return SupportedTypes(
         tuple(table_types),
diff --git a/pandera/cudf_accessor.py b/pandera/cudf_accessor.py
new file mode 100644
index 000000000..e72d07483
--- /dev/null
+++ b/pandera/cudf_accessor.py
@@ -0,0 +1,98 @@
+"""Custom accessor functionality for cudf.
+
+Source code adapted from the pyspark.pandas implementation:
+https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/api/pyspark.pandas.extensions.register_dataframe_accessor.html?highlight=register_dataframe_accessor#pyspark.pandas.extensions.register_dataframe_accessor
+"""
+
+import warnings
+
+from pandera.pandas_accessor import (
+    PanderaDataFrameAccessor,
+    PanderaSeriesAccessor,
+)
+
+
+# pylint: disable=too-few-public-methods
+class CachedAccessor:
+    """
+    Custom property-like object.
+
+    A descriptor for caching accessors:
+
+    :param name: Namespace that the accessor's methods, properties, etc. will
+        be accessed under, e.g. "foo" for a dataframe accessor yields the
+        accessor ``df.foo``
+    :param cls: Class with the extension methods.
+
+    For accessors, the class's __init__ method assumes that you are
+    registering an accessor for one of ``Series``, ``DataFrame``, or
+    ``Index``.
+    """
+
+    def __init__(self, name, accessor):
+        self._name = name
+        self._accessor = accessor
+
+    def __get__(self, obj, cls):
+        if obj is None:  # pragma: no cover
+            return self._accessor
+        accessor_obj = self._accessor(obj)
+        object.__setattr__(obj, self._name, accessor_obj)
+        return accessor_obj
+
+
+def _register_accessor(name, cls):
+    """
+    Register a custom accessor on {class} objects.
+
+    :param name: Name under which the accessor should be registered. A warning
+        is issued if this name conflicts with a preexisting attribute.
+    :returns: A class decorator callable.
+    """
+
+    def decorator(accessor):
+        if hasattr(cls, name):
+            msg = (
+                f"registration of accessor {accessor} under name '{name}' for "
+                f"type {cls.__name__} is overriding a preexisting attribute "
+                "with the same name."
+            )
+
+            warnings.warn(
+                msg,
+                UserWarning,
+                stacklevel=2,
+            )
+        setattr(cls, name, CachedAccessor(name, accessor))
+        return accessor
+
+    return decorator
+
+
+def register_dataframe_accessor(name):
+    """
+    Register a custom accessor with a DataFrame.
+
+    :param name: name used when calling the accessor after it's registered
+    :returns: a class decorator callable
+    """
+    # pylint: disable=import-outside-toplevel
+    from cudf import DataFrame
+
+    return _register_accessor(name, DataFrame)
+
+
+def register_series_accessor(name):
+    """
+    Register a custom accessor with a Series object.
+
+    :param name: name used when calling the accessor after it's registered
+    :returns: a class decorator callable
+    """
+    # pylint: disable=import-outside-toplevel
+    from cudf import Series
+
+    return _register_accessor(name, Series)
+
+
+register_dataframe_accessor("pandera")(PanderaDataFrameAccessor)
+register_series_accessor("pandera")(PanderaSeriesAccessor)
diff --git a/pandera/errors.py b/pandera/errors.py
index 035ed50bd..c98e3219a 100644
--- a/pandera/errors.py
+++ b/pandera/errors.py
@@ -289,6 +289,22 @@ def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
                 for x in check_failure_cases
             ]
 
+        elif any(
+            type(x).__module__.startswith("cudf")
+            for x in check_failure_cases
+        ):
+            # pylint: disable=import-outside-toplevel
+            # The current version of cudf is not compatible with sort_values() of strings.
+            # The workaround is to convert cudf dataframes to pandas.
+ import cudf + + # concat_fn = cudf.concat + check_failure_cases = [ + # x if isinstance(x, cudf.DataFrame) else cudf.DataFrame(x) + x.to_pandas() if isinstance(x, cudf.DataFrame) else x + for x in check_failure_cases + ] + failure_cases = ( concat_fn(check_failure_cases) .reset_index(drop=True) diff --git a/pandera/typing/__init__.py b/pandera/typing/__init__.py index 63150e4c3..68491309d 100644 --- a/pandera/typing/__init__.py +++ b/pandera/typing/__init__.py @@ -6,7 +6,7 @@ from typing import Set, Type -from pandera.typing import dask, fastapi, geopandas, modin, pyspark +from pandera.typing import dask, fastapi, geopandas, modin, cudf, pyspark from pandera.typing.common import ( BOOL, INT8, diff --git a/pandera/typing/cudf.py b/pandera/typing/cudf.py index 2d93762e3..8efcd6411 100644 --- a/pandera/typing/cudf.py +++ b/pandera/typing/cudf.py @@ -176,7 +176,8 @@ def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: raise ValueError(str(exc)) from exc return cls.to_format(valid_data, schema_model.__config__) - CUDF_INSTALLED=True -except ImportError: - CUDF_INSTALLED=False + + CUDF_INSTALLED = True +except ImportError: + CUDF_INSTALLED = False diff --git a/tests/cudf/conftest.py b/tests/cudf/conftest.py new file mode 100644 index 000000000..852eea3fb --- /dev/null +++ b/tests/cudf/conftest.py @@ -0,0 +1,9 @@ +"""Registers fixtures for core""" + +import os + +import pytest + +# pylint: disable=unused-import +from tests.core.checks_fixtures import custom_check_teardown # noqa + diff --git a/tests/cudf/test_cudf_accessor.py b/tests/cudf/test_cudf_accessor.py new file mode 100644 index 000000000..8f9a1b861 --- /dev/null +++ b/tests/cudf/test_cudf_accessor.py @@ -0,0 +1,21 @@ +"""Unit tests of cudf accessor functionality. +""" + +import pytest + +from pandera import cudf_accessor + + +# pylint: disable=too-few-public-methods +class CustomAccessor: + """Mock accessor class""" + + def __init__(self, obj): + self._obj = obj + + +def test_cudf_accessor_warning(): + """Test that cudf accessor raises warning when name already exists.""" + cudf_accessor.register_dataframe_accessor("foo")(CustomAccessor) + with pytest.warns(UserWarning): + cudf_accessor.register_dataframe_accessor("foo")(CustomAccessor) diff --git a/tests/cudf/test_schemas_on_cudf.py b/tests/cudf/test_schemas_on_cudf.py new file mode 100644 index 000000000..2199c889f --- /dev/null +++ b/tests/cudf/test_schemas_on_cudf.py @@ -0,0 +1,458 @@ +"""Unit tests for cudf data structures.""" + +import typing +from unittest.mock import MagicMock + +import cudf +import pandas as pd +import pytest + +import pandera as pa +from pandera import extensions +from pandera.engines import numpy_engine, pandas_engine +from pandera.typing.modin import DataFrame, Index, Series, modin_version +from tests.strategies.test_strategies import NULLABLE_DTYPES +from tests.strategies.test_strategies import ( + SUPPORTED_DTYPES as SUPPORTED_STRATEGY_DTYPES, +) +from tests.strategies.test_strategies import ( + UNSUPPORTED_DTYPE_CLS as UNSUPPORTED_STRATEGY_DTYPE_CLS, +) + +try: + import hypothesis + import hypothesis.strategies as st +except ImportError: + hypothesis = MagicMock() + st = MagicMock() + + +UNSUPPORTED_STRATEGY_DTYPE_CLS = set(UNSUPPORTED_STRATEGY_DTYPE_CLS) +UNSUPPORTED_STRATEGY_DTYPE_CLS.add(numpy_engine.Object) + +TEST_DTYPES_ON_CUDF = [] +# pylint: disable=redefined-outer-name +# for dtype_cls in pandas_engine.Engine.get_registered_dtypes(): +# if ( +# dtype_cls in UNSUPPORTED_STRATEGY_DTYPE_CLS +# or ( +# 
pandas_engine.Engine.dtype(dtype_cls) +# not in SUPPORTED_STRATEGY_DTYPES +# ) +# or not ( +# pandas_engine.GEOPANDAS_INSTALLED +# and dtype_cls == pandas_engine.Geometry +# ) +# ): +# continue +# TEST_DTYPES_ON_CUDF.append(pandas_engine.Engine.dtype(dtype_cls)) + + +@pytest.mark.parametrize("coerce", [True, False]) +def test_dataframe_schema_case(coerce): + """Test a simple schema case.""" + schema = pa.DataFrameSchema( + { + "int_column": pa.Column(int, pa.Check.ge(0)), + "float_column": pa.Column(float, pa.Check.le(0)), + # cudf not implemented "str_column": pa.Column(str, pa.Check.isin(list("abcde"))), + }, + coerce=coerce, + ) + cdf = cudf.DataFrame( + { + "int_column": range(10), + "float_column": [float(-x) for x in range(10)], + # cudf not implemented "str_column": list("aabbcceedd"), + } + ) + assert isinstance(schema.validate(cdf), cudf.DataFrame) + + +def _test_datatype_with_schema( + schema: typing.Union[pa.DataFrameSchema, pa.SeriesSchema], + data: st.DataObject, +): + """Test pandera datatypes against modin data containers.""" + data_container_cls = { + pa.DataFrameSchema: cudf.DataFrame, + pa.SeriesSchema: cudf.Series, + pa.Column: cudf.DataFrame, + }[type(schema)] + + sample = data.draw(schema.strategy(size=3)) + assert isinstance(schema(data_container_cls(sample)), data_container_cls) + + +@pytest.mark.parametrize("dtype_cls", TEST_DTYPES_ON_CUDF) +@pytest.mark.parametrize("coerce", [True, False]) +@hypothesis.given(st.data()) +def test_dataframe_schema_dtypes( + dtype_cls: pandas_engine.DataType, + coerce: bool, + data: st.DataObject, +): + """ + Test that all supported modin data types work as expected for dataframes. + """ + dtype = pandas_engine.Engine.dtype(dtype_cls) + schema = pa.DataFrameSchema({"column": pa.Column(dtype)}, coerce=coerce) + with pytest.warns( + UserWarning, match="Distributing .+ object. This may take some time." + ): + _test_datatype_with_schema(schema, data) + + +@pytest.mark.parametrize("dtype_cls", TEST_DTYPES_ON_CUDF) +@pytest.mark.parametrize("coerce", [True, False]) +@pytest.mark.parametrize("schema_cls", [pa.SeriesSchema, pa.Column]) +@hypothesis.given(st.data()) +def test_field_schema_dtypes( + dtype_cls: pandas_engine.DataType, + coerce: bool, + schema_cls, + data: st.DataObject, +): + """ + Test that all supported modin data types work as expected for series. + """ + schema = schema_cls(dtype_cls, name="field", coerce=coerce) + _test_datatype_with_schema(schema, data) + + +@pytest.mark.parametrize( + "dtype", + [ + int, + float, + bool, + # str, + # pandas_engine.DateTime, + ], +) +@pytest.mark.parametrize("coerce", [True, False]) +@pytest.mark.parametrize("schema_cls", [pa.Index]) +@hypothesis.given(st.data()) +def test_index_dtypes( + dtype: pandas_engine.DataType, + coerce: bool, + schema_cls, + data: st.DataObject, +): + """Test cudf Index and MultiIndex on subset of datatypes. + + Only test basic datatypes since index handling in pandas is already a + little finicky. 
+ """ + if schema_cls is pa.Index: + schema = schema_cls(dtype, name="field", coerce=coerce) + else: + schema = schema_cls(indexes=[pa.Index(dtype, name="field")]) + schema.coerce = coerce + sample = data.draw(schema.strategy(size=3)) + assert isinstance( + schema(cudf.DataFrame(pd.DataFrame(index=sample))), cudf.DataFrame + ) + + +@pytest.mark.parametrize( + "dtype", + [ + dt + for dt in TEST_DTYPES_ON_CUDF + # pylint: disable=no-value-for-parameter + if dt in NULLABLE_DTYPES + and not ( + pandas_engine.GEOPANDAS_INSTALLED + and dt == pandas_engine.Engine.dtype(pandas_engine.Geometry) + ) + ], +) +@hypothesis.given(st.data()) +@hypothesis.settings( + suppress_health_check=[hypothesis.HealthCheck.too_slow], +) +def test_nullable( + dtype: pandas_engine.DataType, + data: st.DataObject, +): + """Test nullable checks on cudf dataframes.""" + checks = None + nullable_schema = pa.DataFrameSchema( + {"field": pa.Column(dtype, checks=checks, nullable=True)} + ) + nonnullable_schema = pa.DataFrameSchema( + {"field": pa.Column(dtype, checks=checks, nullable=False)} + ) + null_sample = data.draw(nullable_schema.strategy(size=5)) + nonnull_sample = data.draw(nonnullable_schema.strategy(size=5)) + + ks_null_sample = cudf.DataFrame(null_sample) + ks_nonnull_sample = cudf.DataFrame(nonnull_sample) + n_nulls = ks_null_sample.isna().sum().item() + assert ks_nonnull_sample.notna().all().item() + assert n_nulls >= 0 + if n_nulls > 0: + with pytest.raises(pa.errors.SchemaError): + nonnullable_schema(ks_null_sample) + + +def test_required_column(): + """Test the required column raises error.""" + required_schema = pa.DataFrameSchema( + {"field": pa.Column(int, required=True)} + ) + schema = pa.DataFrameSchema({"field_": pa.Column(int, required=False)}) + + data = cudf.DataFrame({"field": [1, 2, 3]}) + + assert isinstance(required_schema(data), cudf.DataFrame) + assert isinstance(schema(data), cudf.DataFrame) + + with pytest.raises(pa.errors.SchemaError): + required_schema(cudf.DataFrame({"another_field": [1, 2, 3]})) + schema(cudf.DataFrame({"another_field": [1, 2, 3]})) + + +@pytest.mark.parametrize("from_dtype", [bool, float, int]) +@pytest.mark.parametrize("to_dtype", [float, int, str, bool]) +@hypothesis.given(st.data()) +def test_dtype_coercion(from_dtype, to_dtype, data): + """Test the datatype coercion provides informative errors.""" + from_schema = pa.DataFrameSchema({"field": pa.Column(from_dtype)}) + to_schema = pa.DataFrameSchema({"field": pa.Column(to_dtype, coerce=True)}) + + pd_sample = data.draw(from_schema.strategy(size=3)) + sample = cudf.DataFrame(pd_sample) + + if from_dtype is to_dtype: + assert isinstance(to_schema(sample), cudf.DataFrame) + return + + if from_dtype is str and to_dtype in {int, float}: + try: + result = to_schema(sample) + assert result["field"].dtype == to_dtype + except pa.errors.SchemaError as err: + for x in err.failure_cases.failure_case: + with pytest.raises(ValueError): + to_dtype(x) + return + + assert isinstance(to_schema(sample), cudf.DataFrame) + + +def test_strict_schema(): + """Test schema strictness.""" + strict_schema = pa.DataFrameSchema({"field": pa.Column()}, strict=True) + non_strict_schema = pa.DataFrameSchema({"field": pa.Column()}) + + strict_df = cudf.DataFrame({"field": [1]}) + non_strict_df = cudf.DataFrame({"field": [1], "foo": [2]}) + + strict_schema(strict_df) + non_strict_schema(strict_df) + + with pytest.raises( + pa.errors.SchemaError, match="column 'foo' not in DataFrameSchema" + ): + strict_schema(non_strict_df) + + 
non_strict_schema(non_strict_df)
+
+
+# pylint: disable=unused-argument
+def test_custom_checks(custom_check_teardown):
+    """Test that custom checks can be executed."""
+
+    # @extensions.register_check_method(statistics=["value"])
+    # def cudf_eq(cudf_obj, *, value):  # PPR
+    #     return cudf_obj == value
+    #
+    # custom_schema = pa.DataFrameSchema(
+    #     {"field": pa.Column(checks=pa.Check(lambda s: s == 0, name="custom"))}
+    # )
+    #
+    # custom_registered_schema = pa.DataFrameSchema(
+    #     {"field": pa.Column(checks=pa.Check.cudf_eq(0))}
+    # )
+    #
+    # for schema in (custom_schema, custom_registered_schema):
+    #     schema(cudf.DataFrame({"field": [0] * 100}))
+    #
+    # try:
+    #     schema(cudf.DataFrame({"field": [-1] * 100}))
+    # except pa.errors.SchemaError as err:
+    #     assert (err.failure_cases["failure_case"] == -1).all()
+    pass
+
+def test_schema_model():
+    # pylint: disable=missing-class-docstring
+    """Test that SchemaModel subclasses work on cudf dataframes."""
+
+    # pylint: disable=too-few-public-methods
+    class Schema(pa.SchemaModel):
+        int_field: pa.typing.cudf.Series[int] = pa.Field(gt=0)
+        float_field: pa.typing.cudf.Series[float] = pa.Field(lt=0)
+        # in_field: pa.typing.cudf.Series[str] = pa.Field(isin=[1, 2, 3])
+
+    valid_df = cudf.DataFrame(
+        {
+            "int_field": [1, 2, 3],
+            "float_field": [-1.1, -2.1, -3.1],
+            # "in_field": [1, 2, 3],
+        }
+    )
+    invalid_df = cudf.DataFrame(
+        {
+            "int_field": [-1],
+            "field_field": [1.0],
+            # "in_field": [4],
+        }
+    )
+
+    Schema.validate(valid_df)
+    try:
+        Schema.validate(invalid_df, lazy=True)
+    except pa.errors.SchemaErrors as err:
+        expected_failures = {-1, "float_field"}
+        assert (
+            set(err.failure_cases["failure_case"].tolist())
+            == expected_failures
+        )
+
+
+@pytest.mark.parametrize(
+    "check,valid,invalid",
+    [
+        [pa.Check.eq(0), 0, -1],
+        [pa.Check.ne(0), 1, 0],
+        [pa.Check.gt(0), 1, -1],
+        [pa.Check.ge(0), 0, -1],
+        [pa.Check.lt(0), -1, 0],
+        [pa.Check.le(0), 0, 1],
+        [pa.Check.in_range(0, 10), 5, -1],
+        # FIXME: to be validated
+        # [pa.Check.isin(["a"]), "a", "b"],
+        # [pa.Check.notin(["a"]), "b", "a"],
+        # [pa.Check.str_matches("^a$"), "a", "b"],
+        # [pa.Check.str_contains("a"), "faa", "foo"],
+        # [pa.Check.str_startswith("a"), "ab", "ba"],
+        # [pa.Check.str_endswith("a"), "ba", "ab"],
+        # [pa.Check.str_length(1, 2), "a", ""],
+    ],
+)
+def test_check_comparison_operators(check, valid, invalid):
+    """Test simple comparison operators."""
+    valid_check_result = check(cudf.Series([valid] * 3))
+    invalid_check_result = check(cudf.Series([invalid] * 3))
+    assert valid_check_result.check_passed
+    assert not invalid_check_result.check_passed
+
+
+def test_check_decorators():
+    # pylint: disable=missing-class-docstring
+    """Test that pandera decorators work with cudf."""
+    in_schema = pa.DataFrameSchema({"a": pa.Column(int)})
+    out_schema = in_schema.add_columns({"b": pa.Column(int)})
+
+    # pylint: disable=too-few-public-methods
+    class InSchema(pa.SchemaModel):
+        a: pa.typing.cudf.Series[int]
+
+    class OutSchema(InSchema):
+        b: pa.typing.cudf.Series[int]
+
+    @pa.check_input(in_schema)
+    @pa.check_output(out_schema)
+    def function_check_input_output(df: cudf.DataFrame) -> cudf.DataFrame:
+        df["b"] = df["a"] + 1
+        return df
+
+    @pa.check_input(in_schema)
+    @pa.check_output(out_schema)
+    def function_check_input_output_invalid(
+        df: cudf.DataFrame,
+    ) -> cudf.DataFrame:
+        return df
+
+    @pa.check_io(df=in_schema, out=out_schema)
+    def function_check_io(df: cudf.DataFrame) -> cudf.DataFrame:
+        df["b"] = df["a"] + 1
+        return df
+
+    @pa.check_io(df=in_schema, out=out_schema)
+    def function_check_io_invalid(df: cudf.DataFrame) -> cudf.DataFrame:
+        return df
+
+    @pa.check_types
+    def function_check_types(
+        df: pa.typing.cudf.DataFrame[InSchema],
+    ) -> pa.typing.cudf.DataFrame[OutSchema]:
+        df["b"] = df["a"] + 1
+        return df
+
+    @pa.check_types
+    def function_check_types_invalid(
+        df: pa.typing.cudf.DataFrame[InSchema],
+    ) -> pa.typing.cudf.DataFrame[OutSchema]:
+        return df
+
+    valid_df = cudf.DataFrame({"a": [1, 2, 3]})
+    invalid_df = cudf.DataFrame({"b": [1, 2, 3]})
+
+    function_check_input_output(valid_df)
+    function_check_io(valid_df)
+    function_check_types(valid_df)
+
+    for fn in (
+        function_check_input_output,
+        function_check_input_output_invalid,
+        function_check_io,
+        function_check_io_invalid,
+        function_check_types,
+        function_check_types_invalid,
+    ):
+        with pytest.raises(pa.errors.SchemaError):
+            fn(invalid_df)
+
+    for fn in (
+        function_check_input_output_invalid,
+        function_check_io_invalid,
+        function_check_types_invalid,
+    ):
+        with pytest.raises(pa.errors.SchemaError):
+            fn(valid_df)
+
+
+# pylint: disable=too-few-public-methods
+class InitSchema(pa.SchemaModel):
+    """Schema used for dataframe initialization."""
+
+    col1: Series[int]
+    col2: Series[float]
+    col3: Series[str]
+    index: Index[int]
+
+
+def test_init_cudf_dataframe():
+    """Test initialization of pandera.typing.cudf.DataFrame with Schema."""
+    assert isinstance(
+        DataFrame[InitSchema]({"col1": [1], "col2": [1.0], "col3": ["1"]}),
+        DataFrame,
+    )
+
+
+@pytest.mark.parametrize(
+    "invalid_data",
+    [
+        {"col1": [1.0], "col2": [1.0], "col3": ["1"]},
+        {"col1": [1], "col2": [1], "col3": ["1"]},
+        {"col1": [1], "col2": [1.0], "col3": [1]},
+        {"col1": [1]},
+    ],
+)
+def test_init_cudf_dataframe_errors(invalid_data):
+    """Test errors from initializing a pandera.typing.DataFrame with Schema."""
+    with pytest.raises(pa.errors.SchemaError):
+        DataFrame[InitSchema](invalid_data)

From 28730689c680c0cd74e172555cc32d73ae6b598f Mon Sep 17 00:00:00 2001
From: Philippe Prados
Date: Fri, 21 Oct 2022 10:25:51 +0200
Subject: [PATCH 6/8] Fix strings tests

---
 pandera/core/pandas/checks.py        |  12 +++
 pandera/errors.py                    |   3 +-
 pandera/typing/cudf.py               |  11 +--
 tests/conftest.py                    |   3 +
 tests/cudf/conftest.py               |   9 --
 tests/cudf/test_schemas_on_cudf.py   | 132 +++++++++++++++------------
 tests/modin/test_schemas_on_modin.py |   2 +-
 7 files changed, 97 insertions(+), 75 deletions(-)
 delete mode 100644 tests/cudf/conftest.py

diff --git a/pandera/core/pandas/checks.py b/pandera/core/pandas/checks.py
index e9ab11dc3..9598da7d0 100644
--- a/pandera/core/pandas/checks.py
+++ b/pandera/core/pandas/checks.py
@@ -300,6 +300,9 @@ def str_matches(
     :param pattern: Regular expression pattern to use for matching
     :param kwargs: key-word arguments passed into the `Check` initializer.
     """
+    if data.__module__.startswith("cudf"):
+        # This should be in its own backend implementation
+        return data.str.match(cast(str, pattern))
     return data.str.match(cast(str, pattern), na=False)
 
 
@@ -317,6 +320,9 @@ def str_contains(
     :param pattern: Regular expression pattern to use for searching
     :param kwargs: key-word arguments passed into the `Check` initializer.
""" + if data.__module__.startswith("cudf"): + # This should be in its own backend implementation + return data.str.contains(cast(str, pattern)) return data.str.contains(cast(str, pattern), na=False) @@ -330,6 +336,9 @@ def str_startswith(data: PandasData, string: str) -> PandasData: :param string: String all values should start with :param kwargs: key-word arguments passed into the `Check` initializer. """ + if data.__module__.startswith("cudf"): + # This should be in its own backend implementation + return data.str.startswith(string) return data.str.startswith(string, na=False) @@ -342,6 +351,9 @@ def str_endswith(data: PandasData, string: str) -> PandasData: :param string: String all values should end with :param kwargs: key-word arguments passed into the `Check` initializer. """ + if data.__module__.startswith("cudf"): + # This should be in its own backend implementation + return data.str.endswith(string, na=False) return data.str.endswith(string, na=False) diff --git a/pandera/errors.py b/pandera/errors.py index c98e3219a..35b77a770 100644 --- a/pandera/errors.py +++ b/pandera/errors.py @@ -290,8 +290,7 @@ def _parse_schema_errors(schema_errors: List[Dict[str, Any]]): ] elif any( - type(x).__module__.startswith("cudf") - for x in check_failure_cases + type(x).__module__.startswith("cudf") for x in check_failure_cases ): # pylint: disable=import-outside-toplevel # The current version of cudf is not compatible with sort_values() of strings. diff --git a/pandera/typing/cudf.py b/pandera/typing/cudf.py index 8efcd6411..a49f57e79 100644 --- a/pandera/typing/cudf.py +++ b/pandera/typing/cudf.py @@ -23,7 +23,6 @@ except ImportError: ModelField = Any # type: ignore - # pylint:disable=too-few-public-methods class Index(IndexBase, cudf.Index, Generic[GenericDtype]): """Representation of pandas.Index, only used for type annotation. @@ -31,7 +30,6 @@ class Index(IndexBase, cudf.Index, Generic[GenericDtype]): *new in 0.5.0* """ - # pylint:disable=too-few-public-methods class Series(SeriesBase, cudf.Series, Generic[GenericDtype]): # type: ignore """Representation of pandas.Series, only used for type annotation. @@ -40,6 +38,7 @@ class Series(SeriesBase, cudf.Series, Generic[GenericDtype]): # type: ignore """ if hasattr(pd.Series, "__class_getitem__") and _GenericAlias: + def __class_getitem__(cls, item): """Define this to override the patch that pyspark.pandas performs on pandas. https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 @@ -47,14 +46,12 @@ def __class_getitem__(cls, item): _type_check(item, "Parameters to generic types must be types.") return _GenericAlias(cls, item) - # pylint:disable=invalid-name if TYPE_CHECKING: T = TypeVar("T") # pragma: no cover else: T = Schema - # pylint:disable=too-few-public-methods class DataFrame(DataFrameBase, cudf.DataFrame, Generic[T]): """ @@ -64,6 +61,7 @@ class DataFrame(DataFrameBase, cudf.DataFrame, Generic[T]): """ if hasattr(pd.DataFrame, "__class_getitem__") and _GenericAlias: + def __class_getitem__(cls, item): """Define this to override the patch that pyspark.pandas performs on pandas. 
https://github.com/apache/spark/blob/master/python/pyspark/pandas/__init__.py#L124-L144 @@ -162,7 +160,9 @@ def _get_schema(cls, field: ModelField): return schema_model, schema @classmethod - def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: + def pydantic_validate( + cls, obj: Any, field: ModelField + ) -> pd.DataFrame: """ Verify that the input can be converted into a pandas dataframe that meets all schema requirements. @@ -177,7 +177,6 @@ def pydantic_validate(cls, obj: Any, field: ModelField) -> pd.DataFrame: return cls.to_format(valid_data, schema_model.__config__) - CUDF_INSTALLED = True except ImportError: CUDF_INSTALLED = False diff --git a/tests/conftest.py b/tests/conftest.py index 07e327a63..87fb9459b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,9 @@ import os +# pylint: disable=unused-import +from tests.core.checks_fixtures import custom_check_teardown + try: # pylint: disable=unused-import import hypothesis # noqa F401 diff --git a/tests/cudf/conftest.py b/tests/cudf/conftest.py deleted file mode 100644 index 852eea3fb..000000000 --- a/tests/cudf/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Registers fixtures for core""" - -import os - -import pytest - -# pylint: disable=unused-import -from tests.core.checks_fixtures import custom_check_teardown # noqa - diff --git a/tests/cudf/test_schemas_on_cudf.py b/tests/cudf/test_schemas_on_cudf.py index 2199c889f..5e0260649 100644 --- a/tests/cudf/test_schemas_on_cudf.py +++ b/tests/cudf/test_schemas_on_cudf.py @@ -10,11 +10,8 @@ import pandera as pa from pandera import extensions from pandera.engines import numpy_engine, pandas_engine -from pandera.typing.modin import DataFrame, Index, Series, modin_version +from pandera.typing.modin import DataFrame, Index, Series from tests.strategies.test_strategies import NULLABLE_DTYPES -from tests.strategies.test_strategies import ( - SUPPORTED_DTYPES as SUPPORTED_STRATEGY_DTYPES, -) from tests.strategies.test_strategies import ( UNSUPPORTED_DTYPE_CLS as UNSUPPORTED_STRATEGY_DTYPE_CLS, ) @@ -26,26 +23,10 @@ hypothesis = MagicMock() st = MagicMock() - UNSUPPORTED_STRATEGY_DTYPE_CLS = set(UNSUPPORTED_STRATEGY_DTYPE_CLS) UNSUPPORTED_STRATEGY_DTYPE_CLS.add(numpy_engine.Object) -TEST_DTYPES_ON_CUDF = [] -# pylint: disable=redefined-outer-name -# for dtype_cls in pandas_engine.Engine.get_registered_dtypes(): -# if ( -# dtype_cls in UNSUPPORTED_STRATEGY_DTYPE_CLS -# or ( -# pandas_engine.Engine.dtype(dtype_cls) -# not in SUPPORTED_STRATEGY_DTYPES -# ) -# or not ( -# pandas_engine.GEOPANDAS_INSTALLED -# and dtype_cls == pandas_engine.Geometry -# ) -# ): -# continue -# TEST_DTYPES_ON_CUDF.append(pandas_engine.Engine.dtype(dtype_cls)) +TEST_DTYPES_ON_CUDF: typing.List[str] = [] @pytest.mark.parametrize("coerce", [True, False]) @@ -55,7 +36,8 @@ def test_dataframe_schema_case(coerce): { "int_column": pa.Column(int, pa.Check.ge(0)), "float_column": pa.Column(float, pa.Check.le(0)), - # cudf not implemented "str_column": pa.Column(str, pa.Check.isin(list("abcde"))), + # not implemented in cudf 22.08.00 + # "str_column": pa.Column(str, pa.Check.isin(list("abcde"))), }, coerce=coerce, ) @@ -63,7 +45,7 @@ def test_dataframe_schema_case(coerce): { "int_column": range(10), "float_column": [float(-x) for x in range(10)], - # cudf not implemented "str_column": list("aabbcceedd"), + # "str_column": list("aabbcceedd"), # not implemented in cudf 22.08.00 } ) assert isinstance(schema.validate(cdf), cudf.DataFrame) @@ -126,12 +108,14 @@ def test_field_schema_dtypes( int, 
float, bool, - # str, - # pandas_engine.DateTime, + # str, # not implemented in cudf 22.08.00 + # pandas_engine.DateTime, # not implemented in cudf 22.08.00 ], ) @pytest.mark.parametrize("coerce", [True, False]) -@pytest.mark.parametrize("schema_cls", [pa.Index]) +@pytest.mark.parametrize( + "schema_cls", [pa.Index] +) # Multiindex not implemented in cudf 22.08.00 @hypothesis.given(st.data()) def test_index_dtypes( dtype: pandas_engine.DataType, @@ -197,6 +181,39 @@ def test_nullable( nonnullable_schema(ks_null_sample) +# def test_unique(): # cudf 22.08.00 not implemented `df.duplicated()` +# """Test uniqueness checks on modin dataframes.""" +# schema = pa.DataFrameSchema({"field": pa.Column(int)}, unique=["field"]) +# column_schema = pa.Column(int, unique=True, name="field") +# series_schema = pa.SeriesSchema(int, unique=True, name="field") +# +# data_unique = cudf.DataFrame({"field": [1, 2, 3]}) +# data_non_unique = cudf.DataFrame({"field": [1, 1, 1]}) +# +# assert isinstance(schema(data_unique), cudf.DataFrame) +# assert isinstance(column_schema(data_unique), cudf.DataFrame) +# assert isinstance(series_schema(data_unique["field"]), cudf.Series) +# +# with pytest.raises(pa.errors.SchemaError, match="columns .+ not unique"): +# schema(data_non_unique) +# with pytest.raises( +# pa.errors.SchemaError, match="series .+ contains duplicate values" +# ): +# column_schema(data_non_unique) +# with pytest.raises( +# pa.errors.SchemaError, match="series .+ contains duplicate values" +# ): +# series_schema(data_non_unique["field"]) +# +# schema.unique = None +# column_schema.unique = False +# series_schema.unique = False +# +# assert isinstance(schema(data_non_unique), mpd.DataFrame) +# assert isinstance(column_schema(data_non_unique), mpd.DataFrame) +# assert isinstance(series_schema(data_non_unique["field"]), mpd.Series) + + def test_required_column(): """Test the required column raises error.""" required_schema = pa.DataFrameSchema( @@ -214,7 +231,9 @@ def test_required_column(): schema(cudf.DataFrame({"another_field": [1, 2, 3]})) -@pytest.mark.parametrize("from_dtype", [bool, float, int]) +@pytest.mark.parametrize( + "from_dtype", [bool, float, int] +) # str not implemented in cudf 22.08.00 @pytest.mark.parametrize("to_dtype", [float, int, str, bool]) @hypothesis.given(st.data()) def test_dtype_coercion(from_dtype, to_dtype, data): @@ -265,26 +284,26 @@ def test_strict_schema(): def test_custom_checks(custom_check_teardown): """Test that custom checks can be executed.""" - # @extensions.register_check_method(statistics=["value"]) - # def cudf_eq(cudf_obj, *, value): # PPR - # return cudf_obj == value - # - # custom_schema = pa.DataFrameSchema( - # {"field": pa.Column(checks=pa.Check(lambda s: s == 0, name="custom"))} - # ) - # - # custom_registered_schema = pa.DataFrameSchema( - # {"field": pa.Column(checks=pa.Check.cudf_eq(0))} - # ) - # - # for schema in (custom_schema, custom_registered_schema): - # schema(cudf.DataFrame({"field": [0] * 100})) - # - # try: - # schema(cudf.DataFrame({"field": [-1] * 100})) - # except pa.errors.SchemaError as err: - # assert (err.failure_cases["failure_case"] == -1).all() - pass + @extensions.register_check_method(statistics=["value"]) + def cudf_eq(cudf_obj, *, value): + return cudf_obj == value + + custom_schema = pa.DataFrameSchema( + {"field": pa.Column(checks=pa.Check(lambda s: s == 0, name="custom"))} + ) + + custom_registered_schema = pa.DataFrameSchema( + {"field": pa.Column(checks=pa.Check.cudf_eq(0))} + ) + + for schema in (custom_schema, 
custom_registered_schema):
+        schema(cudf.DataFrame({"field": [0] * 100}))
+
+    try:
+        schema(cudf.DataFrame({"field": [-1] * 100}))
+    except pa.errors.SchemaError as err:
+        assert (err.failure_cases["failure_case"] == -1).all()
+
 
 def test_schema_model():
     # pylint: disable=missing-class-docstring
@@ -300,14 +319,14 @@ class Schema(pa.SchemaModel):
         {
             "int_field": [1, 2, 3],
             "float_field": [-1.1, -2.1, -3.1],
-            # "in_field": [1, 2, 3],
+            # "str_field": ["a", "b", "c"],  # not implemented in cudf 22.08.00
         }
     )
     invalid_df = cudf.DataFrame(
         {
             "int_field": [-1],
             "field_field": [1.0],
-            # "in_field": [4],
+            # "str_field": ["d"],  # not implemented in cudf 22.08.00
         }
     )
@@ -332,14 +351,13 @@ class Schema(pa.SchemaModel):
         [pa.Check.lt(0), -1, 0],
         [pa.Check.le(0), 0, 1],
         [pa.Check.in_range(0, 10), 5, -1],
-        # FIXME: to be validated
-        # [pa.Check.isin(["a"]), "a", "b"],
-        # [pa.Check.notin(["a"]), "b", "a"],
-        # [pa.Check.str_matches("^a$"), "a", "b"],
-        # [pa.Check.str_contains("a"), "faa", "foo"],
-        # [pa.Check.str_startswith("a"), "ab", "ba"],
-        # [pa.Check.str_endswith("a"), "ba", "ab"],
-        # [pa.Check.str_length(1, 2), "a", ""],
+        # [pa.Check.isin(["a"]), "a", "b"],  # Not implemented by cudf
+        # [pa.Check.notin(["a"]), "b", "a"],  # Not implemented by cudf
+        [pa.Check.str_matches("^a$"), "a", "b"],
+        [pa.Check.str_contains("a"), "faa", "foo"],
+        [pa.Check.str_startswith("a"), "ab", "ba"],
+        [pa.Check.str_endswith("a"), "ba", "ab"],
+        [pa.Check.str_length(1, 2), "a", ""],
     ],
 )
 def test_check_comparison_operators(check, valid, invalid):
diff --git a/tests/modin/test_schemas_on_modin.py b/tests/modin/test_schemas_on_modin.py
index d8393d39e..f24346da1 100644
--- a/tests/modin/test_schemas_on_modin.py
+++ b/tests/modin/test_schemas_on_modin.py
@@ -251,7 +251,7 @@ def test_required_column():
         schema(mpd.DataFrame({"another_field": [1, 2, 3]}))
 
 
-@pytest.mark.parametrize("from_dtype", [str])
+@pytest.mark.parametrize("from_dtype", [bool, float, int, str])
 @pytest.mark.parametrize("to_dtype", [float, int, str, bool])
 @hypothesis.given(st.data())
 def test_dtype_coercion(from_dtype, to_dtype, data):

From aee1811dcf1b0cee6c32a901f8feb98e4ec62c08 Mon Sep 17 00:00:00 2001
From: cosmicBboy
Date: Thu, 26 Jan 2023 22:00:04 -0500
Subject: [PATCH 7/8] move cudf_accessor.py

---
 pandera/{ => accessors}/cudf_accessor.py |  0
 pandera/core/extensions.py               |  2 +-
 pandera/typing/common.py                 |  4 ++--
 pandera/typing/cudf.py                   | 12 +++++++++---
 tests/cudf/test_cudf_accessor.py         |  2 +-
 5 files changed, 13 insertions(+), 7 deletions(-)
 rename pandera/{ => accessors}/cudf_accessor.py (100%)

diff --git a/pandera/cudf_accessor.py b/pandera/accessors/cudf_accessor.py
similarity index 100%
rename from pandera/cudf_accessor.py
rename to pandera/accessors/cudf_accessor.py
diff --git a/pandera/core/extensions.py b/pandera/core/extensions.py
index f440971b3..ddecd4605 100644
--- a/pandera/core/extensions.py
+++ b/pandera/core/extensions.py
@@ -3,7 +3,7 @@
 import warnings
 from enum import Enum
 from functools import partial, wraps
-from inspect import signature, Parameter, Signature, _empty
+from inspect import signature, Parameter, Signature, _empty  # type: ignore
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 
 import pandas as pd
diff --git a/pandera/typing/common.py b/pandera/typing/common.py
index 0490de0df..fd5c4b6c9 100644
--- a/pandera/typing/common.py
+++ b/pandera/typing/common.py
@@ -95,7 +95,7 @@
 else:
     GenericDtype = TypeVar(  # type: ignore
         "GenericDtype",
-        bound=Union[
+        bound=Union[  # type: ignore
            bool,
            int,
            str,
@@ -134,7 +134,7 @@
     ],
 )
 
-DataFrameModel = TypeVar("Schema", bound="DataFrameModel")  # type: ignore
+DataFrameModel = TypeVar("DataFrameModel", bound="DataFrameModel")  # type: ignore
 
 
 # pylint:disable=invalid-name
diff --git a/pandera/typing/cudf.py b/pandera/typing/cudf.py
index a49f57e79..ba9e38bdd 100644
--- a/pandera/typing/cudf.py
+++ b/pandera/typing/cudf.py
@@ -6,8 +6,14 @@
 
 import pandas as pd
 
-from ..errors import SchemaError, SchemaInitError
-from .common import DataFrameBase, GenericDtype, IndexBase, Schema, SeriesBase
+from pandera.errors import SchemaError, SchemaInitError
+from pandera.typing.common import (
+    DataFrameBase,
+    GenericDtype,
+    IndexBase,
+    DataFrameModel,
+    SeriesBase,
+)
 from .formats import Formats
 
 try:
@@ -50,7 +56,7 @@ def __class_getitem__(cls, item):
     if TYPE_CHECKING:
         T = TypeVar("T")  # pragma: no cover
     else:
-        T = Schema
+        T = DataFrameModel
 
     # pylint:disable=too-few-public-methods
     class DataFrame(DataFrameBase, cudf.DataFrame, Generic[T]):
diff --git a/tests/cudf/test_cudf_accessor.py b/tests/cudf/test_cudf_accessor.py
index 8f9a1b861..693a913fd 100644
--- a/tests/cudf/test_cudf_accessor.py
+++ b/tests/cudf/test_cudf_accessor.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from pandera import cudf_accessor
+from pandera.accessors import cudf_accessor
 
 
 # pylint: disable=too-few-public-methods

From 9192da71fdb879d1ba8f24b9e004228a583ef607 Mon Sep 17 00:00:00 2001
From: cosmicBboy
Date: Thu, 26 Jan 2023 22:02:06 -0500
Subject: [PATCH 8/8] uncomment and skip test

---
 tests/cudf/test_schemas_on_cudf.py | 63 +++++++++++++++---------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/tests/cudf/test_schemas_on_cudf.py b/tests/cudf/test_schemas_on_cudf.py
index 5e0260649..40acb290a 100644
--- a/tests/cudf/test_schemas_on_cudf.py
+++ b/tests/cudf/test_schemas_on_cudf.py
@@ -181,37 +181,38 @@ def test_nullable(
     nonnullable_schema(ks_null_sample)
 
 
-# def test_unique():  # cudf 22.08.00 not implemented `df.duplicated()`
-#     """Test uniqueness checks on modin dataframes."""
-#     schema = pa.DataFrameSchema({"field": pa.Column(int)}, unique=["field"])
-#     column_schema = pa.Column(int, unique=True, name="field")
-#     series_schema = pa.SeriesSchema(int, unique=True, name="field")
-#
-#     data_unique = cudf.DataFrame({"field": [1, 2, 3]})
-#     data_non_unique = cudf.DataFrame({"field": [1, 1, 1]})
-#
-#     assert isinstance(schema(data_unique), cudf.DataFrame)
-#     assert isinstance(column_schema(data_unique), cudf.DataFrame)
-#     assert isinstance(series_schema(data_unique["field"]), cudf.Series)
-#
-#     with pytest.raises(pa.errors.SchemaError, match="columns .+ not unique"):
-#         schema(data_non_unique)
-#     with pytest.raises(
-#         pa.errors.SchemaError, match="series .+ contains duplicate values"
-#     ):
-#         column_schema(data_non_unique)
-#     with pytest.raises(
-#         pa.errors.SchemaError, match="series .+ contains duplicate values"
-#     ):
-#         series_schema(data_non_unique["field"])
-#
-#     schema.unique = None
-#     column_schema.unique = False
-#     series_schema.unique = False
-#
-#     assert isinstance(schema(data_non_unique), mpd.DataFrame)
-#     assert isinstance(column_schema(data_non_unique), mpd.DataFrame)
-#     assert isinstance(series_schema(data_non_unique["field"]), mpd.Series)
+@pytest.mark.skip(reason="cudf 22.08.00 does not implement `df.duplicated()`")
+def test_unique():
+    """Test uniqueness checks on cudf dataframes."""
+    schema = pa.DataFrameSchema({"field": pa.Column(int)}, unique=["field"])
+    column_schema = pa.Column(int, unique=True, name="field")
+    series_schema = pa.SeriesSchema(int, unique=True, name="field")
+
+    data_unique = cudf.DataFrame({"field": [1, 2, 3]})
+    data_non_unique = cudf.DataFrame({"field": [1, 1, 1]})
+
+    assert isinstance(schema(data_unique), cudf.DataFrame)
+    assert isinstance(column_schema(data_unique), cudf.DataFrame)
+    assert isinstance(series_schema(data_unique["field"]), cudf.Series)
+
+    with pytest.raises(pa.errors.SchemaError, match="columns .+ not unique"):
+        schema(data_non_unique)
+    with pytest.raises(
+        pa.errors.SchemaError, match="series .+ contains duplicate values"
+    ):
+        column_schema(data_non_unique)
+    with pytest.raises(
+        pa.errors.SchemaError, match="series .+ contains duplicate values"
+    ):
+        series_schema(data_non_unique["field"])
+
+    schema.unique = None
+    column_schema.unique = False
+    series_schema.unique = False
+
+    assert isinstance(schema(data_non_unique), cudf.DataFrame)
+    assert isinstance(column_schema(data_non_unique), cudf.DataFrame)
+    assert isinstance(series_schema(data_non_unique["field"]), cudf.Series)
 
 
 def test_required_column():
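
--
Usage sketch (not part of the patches above): with this series applied, cudf
validation is expected to mirror the existing pandas/modin integrations. A
minimal example based on the unit tests added in this series, assuming a
working cudf installation:

    import cudf
    import pandera as pa

    class Schema(pa.SchemaModel):
        int_field: pa.typing.cudf.Series[int] = pa.Field(gt=0)
        float_field: pa.typing.cudf.Series[float] = pa.Field(lt=0)

    df = cudf.DataFrame(
        {"int_field": [1, 2, 3], "float_field": [-1.1, -2.1, -3.1]}
    )

    # validate() returns the validated cudf.DataFrame; on failure it raises
    # pandera.errors.SchemaError (or SchemaErrors when called with lazy=True)
    Schema.validate(df)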