Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add compatibility with Cudf #969

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ dependencies:
- modin
- protobuf <= 3.20.3

# cudf extra
- cudf

# dask extra
- dask
- distributed
Expand Down
98 changes: 98 additions & 0 deletions pandera/accessors/cudf_accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""Custom accessor functionality for cudf.

Source code adapted from pyspark.pandas implementation:
https://spark.apache.org/docs/3.2.0/api/python/reference/pyspark.pandas/api/pyspark.pandas.extensions.register_dataframe_accessor.html?highlight=register_dataframe_accessor#pyspark.pandas.extensions.register_dataframe_accessor
"""

import warnings

from pandera.pandas_accessor import (
PanderaDataFrameAccessor,
PanderaSeriesAccessor,
)


# pylint: disable=too-few-public-methods
class CachedAccessor:
    """
    Custom property-like object.

    A descriptor for caching accessors:

    :param name: Namespace that accessor's methods, properties, etc will be
        accessed under, e.g. "foo" for a dataframe accessor yields the accessor
        ``df.foo``
    :param cls: Class with the extension methods.

    For accessor, the class's __init__ method assumes that you are registering
    an accessor for one of ``Series``, ``DataFrame``, or ``Index``.
    """

    def __init__(self, name, accessor):
        self._name = name
        self._accessor = accessor

    def __get__(self, obj, cls):
        # Class-level access (obj is None) exposes the accessor type itself,
        # mirroring pandas' own CachedAccessor behavior.
        if obj is None:  # pragma: no cover
            return self._accessor
        # Instantiate the accessor once and cache it directly on the instance,
        # so later lookups hit the instance __dict__ and skip this descriptor.
        cached = self._accessor(obj)
        object.__setattr__(obj, self._name, cached)
        return cached


def _register_accessor(name, cls):
    """
    Register a custom accessor on {class} objects.

    :param name: Name under which the accessor should be registered. A warning
        is issued if this name conflicts with a preexisting attribute.
    :param cls: Class to attach the accessor to (e.g. ``cudf.DataFrame``).
    :returns: A class decorator callable.
    """

    def decorator(accessor):
        if hasattr(cls, name):
            # fix: the original concatenated one f-string with two plain
            # string literals, so "{cls.__name__}" was never interpolated and
            # appeared verbatim in the warning message.
            msg = (
                f"registration of accessor {accessor} under name '{name}' for "
                f"type {cls.__name__} is overriding a preexisting attribute "
                "with the same name."
            )
            warnings.warn(
                msg,
                UserWarning,
                stacklevel=2,
            )
        # Install a caching descriptor so the accessor is constructed lazily,
        # once per instance.
        setattr(cls, name, CachedAccessor(name, accessor))
        return accessor

    return decorator


def register_dataframe_accessor(name):
    """
    Register a custom accessor with a DataFrame

    :param name: name used when calling the accessor after its registered
    :returns: a class decorator callable.
    """
    # Import lazily so cudf remains an optional dependency of this module.
    # pylint: disable=import-outside-toplevel
    import cudf

    return _register_accessor(name, cudf.DataFrame)


def register_series_accessor(name):
    """
    Register a custom accessor with a Series object

    :param name: name used when calling the accessor after its registered
    :returns: a callable class decorator
    """
    # Import lazily so cudf remains an optional dependency of this module.
    # pylint: disable=import-outside-toplevel
    import cudf

    return _register_accessor(name, cudf.Series)


# Attach the "pandera" accessor namespace to cudf DataFrame and Series at
# import time, so schemas can be used as df.pandera / series.pandera.
register_dataframe_accessor("pandera")(PanderaDataFrameAccessor)
register_series_accessor("pandera")(PanderaSeriesAccessor)
2 changes: 1 addition & 1 deletion pandera/core/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import warnings
from enum import Enum
from functools import partial, wraps
from inspect import signature, Parameter, Signature, _empty
from inspect import signature, Parameter, Signature, _empty # type: ignore
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union

import pandas as pd
Expand Down
12 changes: 12 additions & 0 deletions pandera/core/pandas/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,9 @@ def str_matches(
:param pattern: Regular expression pattern to use for matching
:param kwargs: key-word arguments passed into the `Check` initializer.
"""
if data.__module__.startswith("cudf"):
# This should be in its own backend implementation
return data.str.match(cast(str, pattern))
return data.str.match(cast(str, pattern), na=False)


Expand All @@ -317,6 +320,9 @@ def str_contains(
:param pattern: Regular expression pattern to use for searching
:param kwargs: key-word arguments passed into the `Check` initializer.
"""
if data.__module__.startswith("cudf"):
# This should be in its own backend implementation
return data.str.contains(cast(str, pattern))
return data.str.contains(cast(str, pattern), na=False)


Expand All @@ -330,6 +336,9 @@ def str_startswith(data: PandasData, string: str) -> PandasData:
:param string: String all values should start with
:param kwargs: key-word arguments passed into the `Check` initializer.
"""
if data.__module__.startswith("cudf"):
# This should be in its own backend implementation
return data.str.startswith(string)
return data.str.startswith(string, na=False)


Expand All @@ -342,6 +351,9 @@ def str_endswith(data: PandasData, string: str) -> PandasData:
:param string: String all values should end with
:param kwargs: key-word arguments passed into the `Check` initializer.
"""
if data.__module__.startswith("cudf"):
# This should be in its own backend implementation
return data.str.endswith(string, na=False)
return data.str.endswith(string, na=False)


Expand Down
8 changes: 8 additions & 0 deletions pandera/core/pandas/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,14 @@ def supported_types() -> SupportedTypes:
index_types.append(dd.Index)
except ImportError:
pass
try:
import cudf

table_types.append(cudf.DataFrame)
field_types.append(cudf.Series)
index_types.append(cudf.Index)
except ImportError:
pass

return SupportedTypes(
tuple(table_types),
Expand Down
15 changes: 15 additions & 0 deletions pandera/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,21 @@ def _parse_schema_errors(schema_errors: List[Dict[str, Any]]):
for x in check_failure_cases
]

elif any(
type(x).__module__.startswith("cudf") for x in check_failure_cases
):
# pylint: disable=import-outside-toplevel
# cudf currently does not support sort_values() on string columns, so as a
# workaround convert any cudf DataFrame failure cases to pandas first.
import cudf

# concat_fn = cudf.concat
check_failure_cases = [
# x if isinstance(x, cudf.DataFrame) else cudf.DataFrame(x)
x.to_pandas() if isinstance(x, cudf.DataFrame) else x
for x in check_failure_cases
]

failure_cases = (
concat_fn(check_failure_cases)
.reset_index(drop=True)
Expand Down
7 changes: 6 additions & 1 deletion pandera/typing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from typing import Set, Type

from pandera.typing import dask, fastapi, geopandas, modin, pyspark
from pandera.typing import dask, fastapi, geopandas, modin, cudf, pyspark
from pandera.typing.common import (
BOOL,
INT8,
Expand Down Expand Up @@ -57,6 +57,11 @@
SERIES_TYPES.update({modin.Series})
INDEX_TYPES.update({modin.Index})

if cudf.CUDF_INSTALLED:
DATAFRAME_TYPES.update({cudf.DataFrame})
SERIES_TYPES.update({cudf.Series})
INDEX_TYPES.update({cudf.Index})

if pyspark.PYSPARK_INSTALLED:
DATAFRAME_TYPES.update({pyspark.DataFrame})
SERIES_TYPES.update({pyspark.Series})
Expand Down
4 changes: 2 additions & 2 deletions pandera/typing/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
else:
GenericDtype = TypeVar( # type: ignore
"GenericDtype",
bound=Union[
bound=Union[ # type: ignore
bool,
int,
str,
Expand Down Expand Up @@ -134,7 +134,7 @@
],
)

DataFrameModel = TypeVar("Schema", bound="DataFrameModel") # type: ignore
DataFrameModel = TypeVar("DataFrameModel", bound="DataFrameModel") # type: ignore


# pylint:disable=invalid-name
Expand Down
Loading