Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#4746: Sync interchange protocol with recent API changes #4763

Merged
merged 12 commits into from
Aug 13, 2022
12 changes: 6 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ jobs:
modin/experimental/core/execution/native/implementations/omnisci_on_native/expr.py \
modin/experimental/core/execution/native/implementations/omnisci_on_native/omnisci_worker.py \
- run: python scripts/doc_checker.py modin/experimental/core/storage_formats/omnisci
- run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol
- run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/omnisci_on_native/interchange/dataframe_protocol
- run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
- run: python scripts/doc_checker.py modin/logging

Expand Down Expand Up @@ -317,7 +317,7 @@ jobs:
- run: python -m pytest modin/test/test_partition_api.py
- run: python -m pytest modin/test/test_utils.py
- run: python -m pytest asv_bench/test/test_utils.py
- run: python -m pytest modin/test/exchange/dataframe_protocol/base
- run: python -m pytest modin/test/interchange/dataframe_protocol/base
- run: python -m pytest modin/test/test_logging.py
- uses: codecov/codecov-action@v2

Expand Down Expand Up @@ -422,8 +422,8 @@ jobs:
- run: MODIN_BENCHMARK_MODE=True pytest modin/pandas/test/internals/test_benchmark_mode.py
- run: pytest modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
- run: pytest modin/pandas/test/test_io.py::TestCsv --verbose
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/omnisci
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/omnisci
- uses: codecov/codecov-action@v2

test-asv-benchmarks:
Expand Down Expand Up @@ -578,8 +578,8 @@ jobs:
- run: python -m pytest modin/pandas/test/test_io.py --verbose
- run: python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: pip install "dfsql>=0.4.2" "pyparsing<=2.4.7" && pytest modin/experimental/sql/test/test_sql.py
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/pandas/test_protocol.py
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
- uses: codecov/codecov-action@v2

test-experimental:
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
- run: python -m pytest modin/test/storage_formats/pandas/test_internals.py
- run: python -m pytest modin/test/test_envvar_npartitions.py
- run: python -m pytest modin/test/test_partition_api.py
- run: python -m pytest modin/test/exchange/dataframe_protocol/base
- run: python -m pytest modin/test/interchange/dataframe_protocol/base
- run: python -m pytest modin/test/test_logging.py
- uses: codecov/codecov-action@v2

Expand Down Expand Up @@ -132,8 +132,8 @@ jobs:
- run: pytest modin/test/storage_formats/omnisci/test_internals.py
- run: pytest modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
- run: pytest modin/pandas/test/test_io.py::TestCsv
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/omnisci
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/omnisci
- uses: codecov/codecov-action@v2

test-all:
Expand Down Expand Up @@ -218,8 +218,8 @@ jobs:
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
- run: python -m pytest modin/pandas/test/test_io.py
- run: python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/pandas/test_protocol.py
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
- uses: codecov/codecov-action@v2

test-windows:
Expand Down
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ Key Features and Updates
* FEAT-#4147: Add partial compatibility with Python 3.6 and pandas 1.1 (#4301)
* FEAT-#4569: Add error message when `read_` function defaults to pandas (#4647)
* FEAT-#4725: Make index and columns lazy in Modin DataFrame (#4726)
* FEAT-#4746: Sync interchange protocol with recent API changes (#4763)

Contributors
------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,37 @@
"""

from abc import ABC, abstractmethod
from typing import Optional, Iterable, Sequence, Any, Tuple, Dict
from typing import Optional, Iterable, Sequence, Any, Tuple, Dict, TypedDict

from .utils import DlpackDeviceType, DTypeKind, ColumnNullType


class ColumnBuffers(TypedDict): # noqa: GL08
# first element is a buffer containing the column data;
# second element is the data buffer's associated dtype
data: Tuple["ProtocolBuffer", Any]

# first element is a buffer containing mask values indicating missing data;
# second element is the mask value buffer's associated dtype.
# None if the null representation is not a bit or byte mask
validity: Optional[Tuple["ProtocolBuffer", Any]]

# first element is a buffer containing the offset values for
# variable-size binary data (e.g., variable-length strings);
# second element is the offsets buffer's associated dtype.
# None if the data buffer does not have an associated offsets buffer
offsets: Optional[Tuple["ProtocolBuffer", Any]]


class CategoricalDescription(TypedDict): # noqa: GL08
# whether the ordering of dictionary indices is semantically meaningful
is_ordered: bool
# whether a column-style mapping of categorical values to other objects exists
is_dictionary: bool
# None if not a column-style categorical.
categories: Optional["ProtocolColumn"]


class ProtocolBuffer(ABC):
"""
Data in the buffer is guaranteed to be contiguous in memory.
Expand Down Expand Up @@ -82,7 +108,7 @@ def __dlpack__(self):
pass

@abstractmethod
def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]:
def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
"""
Device type and device ID for where the data in the buffer resides.

Expand Down Expand Up @@ -148,7 +174,6 @@ class ProtocolColumn(ABC):
so doesn't need its own version or ``__column__`` protocol.
"""

@property
@abstractmethod
def size(self) -> int:
"""
Expand All @@ -157,6 +182,9 @@ def size(self) -> int:
Corresponds to `DataFrame.num_rows()` if column is a single chunk;
equal to size of this current chunk otherwise.

Is a method rather than a property because it may cause a (potentially
expensive) computation for some dataframe implementations.

Returns
-------
int
Expand Down Expand Up @@ -220,12 +248,12 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]:

@property
@abstractmethod
def describe_categorical(self) -> Dict[str, Any]:
def describe_categorical(self) -> CategoricalDescription:
"""
If the dtype is categorical, there are two options.

- There are only values in the data buffer.
- There is a separate dictionary-style encoding for categorical values.
- There is a separate non-categorical Column encoding categorical values.

TBD: are there any other in-memory representations that are needed?

Expand All @@ -235,14 +263,15 @@ def describe_categorical(self) -> Dict[str, Any]:
Content of returned dict:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a dictionary-style mapping of
- "is_dictionary" : bool, whether a mapping of
categorical values to other objects exists
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
None if not a dictionary-style categorical.
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.

Raises
------
``RuntimeError`` if the dtype is not categorical.
``TypeError`` if the dtype is not categorical.
"""
pass

Expand All @@ -253,12 +282,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
Return the missing value (or "null") representation the column dtype uses.

Return as a tuple ``(kind, value)``.
* Kind:
- 0 : non-nullable
- 1 : NaN/NaT
- 2 : sentinel value
- 3 : bit mask
- 4 : byte mask
* Kind: ColumnNullType
* Value : if kind is "sentinel value", the actual value. If kind is a bit
mask or a byte mask, the value (0 or 1) indicating a missing value. None
otherwise.
Expand Down Expand Up @@ -338,7 +362,7 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["ProtocolColumn
pass

@abstractmethod
def get_buffers(self) -> Dict[str, Any]:
def get_buffers(self) -> ColumnBuffers:
"""
Return a dictionary containing the underlying buffers.

Expand Down Expand Up @@ -376,18 +400,22 @@ class ProtocolDataframe(ABC):
to the dataframe interchange protocol specification.
"""

version = 0 # version of the protocol

@abstractmethod
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> "ProtocolDataframe":
"""
Get a new dataframe exchange object.
Construct a new dataframe interchange object, potentially changing the parameters.

See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.

Parameters
----------
nan_as_null : bool, default: False
A keyword intended for the consumer to tell the producer
to overwrite null values in the data with ``NaN`` (or ``NaT``).
to overwrite null values in the data with ``NaN``.
This currently has no effect; once support for nullable extension
dtypes is added, this value should be propagated to columns.
allow_copy : bool, default: True
Expand All @@ -405,7 +433,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):

@property
@abstractmethod
def metadata(self):
def metadata(self) -> Dict[str, Any]:
"""
Get the metadata for the data frame, as a dictionary with string keys.

Expand Down Expand Up @@ -435,7 +463,7 @@ def num_columns(self) -> int:
pass

@abstractmethod
def num_rows(self) -> int:
def num_rows(self) -> Optional[int]:
"""
Return the number of rows in the ProtocolDataframe, if available.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,18 @@
class DTypeKind(enum.IntEnum): # noqa PR01
"""
Integer enum for data types.

Attributes
----------
INT : int
Matches to integer data type.
Matches to signed integer data type.
UINT : int
Matches to unsigned integer data type.
FLOAT : int
Matches to floating point data type.
BOOL : int
Matches to boolean data type.
STRING : int
Matches to string data type.
Matches to string data type (UTF-8 encoded).
DATETIME : int
Matches to datetime data type.
CATEGORICAL : int
Expand All @@ -59,15 +58,14 @@ class DTypeKind(enum.IntEnum): # noqa PR01
class ColumnNullType(enum.IntEnum): # noqa PR01
"""
Integer enum for null type representation.

Attributes
----------
NON_NULLABLE : int
Non-nullable column.
USE_NAN : int
NaN/NaT value.
Use explicit float NaN value.
USE_SENTINEL : int
Sentinel value besides NaN/NaT.
Sentinel value besides NaN.
vnlitvinov marked this conversation as resolved.
Show resolved Hide resolved
USE_BITMASK : int
The bit is set/unset representing a null on a certain position.
USE_BYTEMASK : int
Expand Down
14 changes: 8 additions & 6 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from pandas.core.indexes.api import ensure_index, Index, RangeIndex
from pandas.core.dtypes.common import is_numeric_dtype, is_list_like
from pandas._libs.lib import no_default
from typing import List, Hashable, Optional, Callable, Union, Dict
from typing import List, Hashable, Optional, Callable, Union, Dict, TYPE_CHECKING

from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
from modin.error_message import ErrorMessage
Expand All @@ -36,9 +36,11 @@
Axis,
JoinType,
)
from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import (
ProtocolDataframe,
)

if TYPE_CHECKING:
YarShev marked this conversation as resolved.
Show resolved Hide resolved
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
ProtocolDataframe,
)
from modin.pandas.indexing import is_range_like
from modin.pandas.utils import is_full_grab_slice, check_both_not_none
from modin.logging import ClassLogger
Expand Down Expand Up @@ -2974,7 +2976,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
ProtocolDataframe
A dataframe object following the dataframe protocol specification.
"""
from modin.core.dataframe.pandas.exchange.dataframe_protocol.dataframe import (
from modin.core.dataframe.pandas.interchange.dataframe_protocol.dataframe import (
PandasProtocolDataframe,
)

Expand Down Expand Up @@ -3007,7 +3009,7 @@ def from_dataframe(cls, df: "ProtocolDataframe") -> "PandasDataframe":
"`df` does not support DataFrame exchange protocol, i.e. `__dataframe__` method"
)

from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import (
from modin.core.dataframe.pandas.interchange.dataframe_protocol.from_dataframe import (
from_dataframe_to_pandas,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import numpy as np
from typing import Tuple

from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import (
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
ProtocolBuffer,
)
from modin.utils import _inherit_docstrings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@
import pandas

from modin.utils import _inherit_docstrings
from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import (
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
CategoricalDescription,
ProtocolColumn,
)
from modin.core.dataframe.base.exchange.dataframe_protocol.utils import (
from modin.core.dataframe.base.interchange.dataframe_protocol.utils import (
DTypeKind,
pandas_dtype_to_arrow_c,
ColumnNullType,
Expand Down Expand Up @@ -109,7 +110,6 @@ def __init__(self, column: PandasDataframe, allow_copy: bool = True) -> None:
self._col = column
self._allow_copy = allow_copy

@property
def size(self) -> int:
return len(self._col.index)

Expand Down Expand Up @@ -187,18 +187,21 @@ def _dtype_from_primitive_pandas_dtype(
)

@property
def describe_categorical(self) -> Dict[str, Any]:
if not self.dtype[0] == DTypeKind.CATEGORICAL:
raise RuntimeError(
def describe_categorical(self) -> CategoricalDescription:
if self.dtype[0] != DTypeKind.CATEGORICAL:
raise TypeError(
"`describe_categorical only works on a column with "
+ "categorical dtype!"
)

pandas_series = self._col.to_pandas().squeeze(axis=1)
cat_frame = type(self._col).from_pandas(
pandas.DataFrame({"cat": pandas_series.cat.categories})
YarShev marked this conversation as resolved.
Show resolved Hide resolved
)
return {
"is_ordered": pandas_series.cat.ordered,
"is_dictionary": True,
"mapping": dict(zip(pandas_series.cat.codes, pandas_series.cat.categories)),
"categories": PandasProtocolColumn(cat_frame, self._allow_copy),
}

@property
Expand Down Expand Up @@ -262,7 +265,7 @@ def get_chunks(
self, n_chunks: Optional[int] = None
) -> Iterable["PandasProtocolColumn"]:
cur_n_chunks = self.num_chunks()
n_rows = self.size
n_rows = self.size()
if n_chunks is None or n_chunks == cur_n_chunks:
cum_row_lengths = np.cumsum([0] + self._col._row_lengths)
for i in range(len(cum_row_lengths) - 1):
Expand Down
Loading