FEAT-#4746: Sync interchange protocol with recent API changes #4763

Merged · 12 commits · Aug 13, 2022
12 changes: 6 additions & 6 deletions .github/workflows/ci.yml
@@ -139,7 +139,7 @@ jobs:
modin/experimental/core/execution/native/implementations/omnisci_on_native/expr.py \
modin/experimental/core/execution/native/implementations/omnisci_on_native/omnisci_worker.py \
- run: python scripts/doc_checker.py modin/experimental/core/storage_formats/omnisci
- run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol
- run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/omnisci_on_native/interchange/dataframe_protocol
- run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
- run: python scripts/doc_checker.py modin/logging

@@ -317,7 +317,7 @@ jobs:
- run: python -m pytest modin/test/test_partition_api.py
- run: python -m pytest modin/test/test_utils.py
- run: python -m pytest asv_bench/test/test_utils.py
- run: python -m pytest modin/test/exchange/dataframe_protocol/base
- run: python -m pytest modin/test/interchange/dataframe_protocol/base
- run: python -m pytest modin/test/test_logging.py
- uses: codecov/codecov-action@v2

@@ -422,8 +422,8 @@ jobs:
- run: MODIN_BENCHMARK_MODE=True pytest modin/pandas/test/internals/test_benchmark_mode.py
- run: pytest modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
- run: pytest modin/pandas/test/test_io.py::TestCsv --verbose
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/omnisci
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/omnisci
- uses: codecov/codecov-action@v2

test-asv-benchmarks:
@@ -578,8 +578,8 @@ jobs:
- run: python -m pytest modin/pandas/test/test_io.py --verbose
- run: python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: pip install "dfsql>=0.4.2" "pyparsing<=2.4.7" && pytest modin/experimental/sql/test/test_sql.py
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/pandas/test_protocol.py
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
- uses: codecov/codecov-action@v2

test-experimental:
10 changes: 5 additions & 5 deletions .github/workflows/push.yml
@@ -40,7 +40,7 @@ jobs:
- run: python -m pytest modin/test/storage_formats/pandas/test_internals.py
- run: python -m pytest modin/test/test_envvar_npartitions.py
- run: python -m pytest modin/test/test_partition_api.py
- run: python -m pytest modin/test/exchange/dataframe_protocol/base
- run: python -m pytest modin/test/interchange/dataframe_protocol/base
- run: python -m pytest modin/test/test_logging.py
- uses: codecov/codecov-action@v2

@@ -132,8 +132,8 @@ jobs:
- run: pytest modin/test/storage_formats/omnisci/test_internals.py
- run: pytest modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
- run: pytest modin/pandas/test/test_io.py::TestCsv
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/omnisci
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/omnisci
- uses: codecov/codecov-action@v2

test-all:
@@ -218,8 +218,8 @@ jobs:
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
- run: python -m pytest modin/pandas/test/test_io.py
- run: python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: pytest modin/test/exchange/dataframe_protocol/test_general.py
- run: pytest modin/test/exchange/dataframe_protocol/pandas/test_protocol.py
- run: pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
- uses: codecov/codecov-action@v2

test-windows:
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
@@ -87,6 +87,7 @@ Key Features and Updates
* FEAT-#4147: Add partial compatibility with Python 3.6 and pandas 1.1 (#4301)
* FEAT-#4569: Add error message when `read_` function defaults to pandas (#4647)
* FEAT-#4725: Make index and columns lazy in Modin DataFrame (#4726)
* FEAT-#4746: Sync interchange protocol with recent API changes (#4763)

Contributors
------------
@@ -18,11 +18,37 @@
"""

from abc import ABC, abstractmethod
from typing import Optional, Iterable, Sequence, Any, Tuple, Dict
from typing import Optional, Iterable, Sequence, Any, Tuple, Dict, TypedDict

from .utils import DlpackDeviceType, DTypeKind, ColumnNullType


class ColumnBuffers(TypedDict): # noqa: GL08
# first element is a buffer containing the column data;
# second element is the data buffer's associated dtype
data: Tuple["ProtocolBuffer", Any]

# first element is a buffer containing mask values indicating missing data;
# second element is the mask value buffer's associated dtype.
# None if the null representation is not a bit or byte mask
validity: Optional[Tuple["ProtocolBuffer", Any]]

# first element is a buffer containing the offset values for
# variable-size binary data (e.g., variable-length strings);
# second element is the offsets buffer's associated dtype.
# None if the data buffer does not have an associated offsets buffer
offsets: Optional[Tuple["ProtocolBuffer", Any]]


class CategoricalDescription(TypedDict): # noqa: GL08
# whether the ordering of dictionary indices is semantically meaningful
is_ordered: bool
# whether a column-style mapping of categorical values to other objects exists
is_dictionary: bool
# None if not a column-style categorical.
categories: Optional["ProtocolColumn"]


class ProtocolBuffer(ABC):
"""
Data in the buffer is guaranteed to be contiguous in memory.
@@ -82,7 +108,7 @@ def __dlpack__(self):
pass

@abstractmethod
def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]:
def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]:
"""
Device type and device ID for where the data in the buffer resides.

@@ -148,7 +174,6 @@ class ProtocolColumn(ABC):
so doesn't need its own version or ``__column__`` protocol.
"""

@property
@abstractmethod
def size(self) -> int:
"""
@@ -157,6 +182,9 @@ def size(self) -> int:
Corresponds to `DataFrame.num_rows()` if column is a single chunk;
equal to size of this current chunk otherwise.

Is a method rather than a property because it may cause a (potentially
expensive) computation for some dataframe implementations.

Returns
-------
int
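# Illustrative only (not part of this module): with this change ``size`` is
# invoked as a method, since resolving it may force a row-count computation
# on lazy backends. ``_row_count`` is a hypothetical helper.
def _row_count(col: "ProtocolColumn") -> int:
    return col.size()  # previously accessed as the property ``col.size``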
@@ -220,12 +248,12 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]:

@property
@abstractmethod
def describe_categorical(self) -> Dict[str, Any]:
def describe_categorical(self) -> CategoricalDescription:
"""
If the dtype is categorical, there are two options.

- There are only values in the data buffer.
- There is a separate dictionary-style encoding for categorical values.
- There is a separate non-categorical Column encoding categorical values.

TBD: are there any other in-memory representations that are needed?

@@ -235,14 +263,15 @@ def describe_categorical(self) -> Dict[str, Any]:
Content of returned dict:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a dictionary-style mapping of
- "is_dictionary" : bool, whether a mapping of
categorical values to other objects exists
- "mapping" : dict, Python-level only (e.g. ``{int: str}``).
None if not a dictionary-style categorical.
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.

Raises
------
``RuntimeError`` if the dtype is not categorical.
``TypeError`` if the dtype is not categorical.
"""
pass
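# Illustrative sketch only (not part of this module): reading the new
# CategoricalDescription return value and honoring the TypeError contract.
# The helper name is hypothetical.
def _category_column(col: "ProtocolColumn") -> Optional["ProtocolColumn"]:
    try:
        description = col.describe_categorical
    except TypeError:
        return None  # not a categorical column
    # ``categories`` is itself a column (or None for non-dictionary categoricals).
    return description["categories"] if description["is_dictionary"] else None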

@@ -253,12 +282,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]:
Return the missing value (or "null") representation the column dtype uses.

Return as a tuple ``(kind, value)``.
* Kind:
- 0 : non-nullable
- 1 : NaN/NaT
- 2 : sentinel value
- 3 : bit mask
- 4 : byte mask
* Kind: ColumnNullType
* Value : if kind is "sentinel value", the actual value. If kind is a bit
mask or a byte mask, the value (0 or 1) indicating a missing value. None
otherwise.
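# Illustrative only (not part of this module): matching the (kind, value) pair
# against ColumnNullType members instead of bare integer codes; the helper
# name is hypothetical.
def _uses_mask_nulls(col: "ProtocolColumn") -> bool:
    kind, _ = col.describe_null
    return kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK)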
@@ -338,7 +362,7 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["ProtocolColumn
pass

@abstractmethod
def get_buffers(self) -> Dict[str, Any]:
def get_buffers(self) -> ColumnBuffers:
"""
Return a dictionary containing the underlying buffers.

@@ -376,18 +400,22 @@ class ProtocolDataframe(ABC):
to the dataframe interchange protocol specification.
"""

version = 0 # version of the protocol

@abstractmethod
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> "ProtocolDataframe":
"""
Get a new dataframe exchange object.
Construct a new dataframe interchange object, potentially changing the parameters.

See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.

Parameters
----------
nan_as_null : bool, default: False
A keyword intended for the consumer to tell the producer
to overwrite null values in the data with ``NaN`` (or ``NaT``).
to overwrite null values in the data with ``NaN``.
This currently has no effect; once support for nullable extension
dtypes is added, this value should be propagated to columns.
allow_copy : bool, default: True
@@ -405,7 +433,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):

@property
@abstractmethod
def metadata(self):
def metadata(self) -> Dict[str, Any]:
"""
Get the metadata for the data frame, as a dictionary with string keys.

@@ -435,7 +463,7 @@ def num_columns(self) -> int:
pass

@abstractmethod
def num_rows(self) -> int:
def num_rows(self) -> Optional[int]:
"""
Return the number of rows in the ProtocolDataframe, if available.

@@ -28,19 +28,18 @@
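For reference, a minimal consumer-side sketch of the `ProtocolDataframe` surface touched above; `obj` stands for any hypothetical dataframe exposing `__dataframe__`, and the `allow_copy=False` behavior (raising when a copy would be required) follows the interchange specification rather than this diff:

    protocol_df = obj.__dataframe__(allow_copy=False)
    meta = protocol_df.metadata        # Dict[str, Any] with implementation-specific keys
    n_cols = protocol_df.num_columns()
    n_rows = protocol_df.num_rows()    # may be None if the row count is not cheaply known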
class DTypeKind(enum.IntEnum): # noqa PR01
"""
Integer enum for data types.

Attributes
----------
INT : int
Matches to integer data type.
Matches to signed integer data type.
UINT : int
Matches to unsigned integer data type.
FLOAT : int
Matches to floating point data type.
BOOL : int
Matches to boolean data type.
STRING : int
Matches to string data type.
Matches to string data type (UTF-8 encoded).
DATETIME : int
Matches to datetime data type.
CATEGORICAL : int
@@ -59,15 +58,14 @@ class DTypeKind(enum.IntEnum): # noqa PR01
class ColumnNullType(enum.IntEnum): # noqa PR01
"""
Integer enum for null type representation.

Attributes
----------
NON_NULLABLE : int
Non-nullable column.
USE_NAN : int
NaN/NaT value.
Use explicit float NaN value.
USE_SENTINEL : int
Sentinel value besides NaN/NaT.
Sentinel value besides NaN.
USE_BITMASK : int
The bit is set/unset representing a null on a certain position.
USE_BYTEMASK : int
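A short consumer-side sketch of how these enums are typically matched against a column's reported dtype and null representation (`col` is a hypothetical protocol column; the import path is the one introduced by this PR):

    from modin.core.dataframe.base.interchange.dataframe_protocol.utils import (
        ColumnNullType,
        DTypeKind,
    )

    def is_nullable_string(col) -> bool:
        kind, _, _, _ = col.dtype          # (DTypeKind, bit width, format string, endianness)
        null_kind, _ = col.describe_null   # (ColumnNullType, sentinel/mask value)
        return kind == DTypeKind.STRING and null_kind != ColumnNullType.NON_NULLABLE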
14 changes: 8 additions & 6 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -24,7 +24,7 @@
from pandas.core.indexes.api import ensure_index, Index, RangeIndex
from pandas.core.dtypes.common import is_numeric_dtype, is_list_like
from pandas._libs.lib import no_default
from typing import List, Hashable, Optional, Callable, Union, Dict
from typing import List, Hashable, Optional, Callable, Union, Dict, TYPE_CHECKING

from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
from modin.error_message import ErrorMessage
@@ -36,9 +36,11 @@
Axis,
JoinType,
)
from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import (
ProtocolDataframe,
)

if TYPE_CHECKING:
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
ProtocolDataframe,
)
from modin.pandas.indexing import is_range_like
from modin.pandas.utils import is_full_grab_slice, check_both_not_none
from modin.logging import ClassLogger
@@ -2973,7 +2975,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
ProtocolDataframe
A dataframe object following the dataframe protocol specification.
"""
from modin.core.dataframe.pandas.exchange.dataframe_protocol.dataframe import (
from modin.core.dataframe.pandas.interchange.dataframe_protocol.dataframe import (
PandasProtocolDataframe,
)

@@ -3006,7 +3008,7 @@ def from_dataframe(cls, df: "ProtocolDataframe") -> "PandasDataframe":
"`df` does not support DataFrame exchange protocol, i.e. `__dataframe__` method"
)

from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import (
from modin.core.dataframe.pandas.interchange.dataframe_protocol.from_dataframe import (
from_dataframe_to_pandas,
)

@@ -29,7 +29,7 @@
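Downstream code that imports the protocol helpers needs the matching path update; a hedged sketch of converting through the renamed module follows (the exact call signature of `from_dataframe_to_pandas` is assumed here, not shown in this diff):

    # New location after this PR (previously modin.core.dataframe.pandas.exchange...).
    from modin.core.dataframe.pandas.interchange.dataframe_protocol.from_dataframe import (
        from_dataframe_to_pandas,
    )

    def to_pandas_via_protocol(obj):
        # Mirrors the guard in PandasDataframe.from_dataframe above.
        if not hasattr(obj, "__dataframe__"):
            raise ValueError(
                "`obj` does not support DataFrame exchange protocol, i.e. `__dataframe__` method"
            )
        # Assumed to accept any object implementing the interchange protocol.
        return from_dataframe_to_pandas(obj)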
import numpy as np
from typing import Tuple

from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import (
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
ProtocolBuffer,
)
from modin.utils import _inherit_docstrings
@@ -30,10 +30,11 @@
import pandas

from modin.utils import _inherit_docstrings
from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import (
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
CategoricalDescription,
ProtocolColumn,
)
from modin.core.dataframe.base.exchange.dataframe_protocol.utils import (
from modin.core.dataframe.base.interchange.dataframe_protocol.utils import (
DTypeKind,
pandas_dtype_to_arrow_c,
ColumnNullType,
@@ -109,7 +110,6 @@ def __init__(self, column: PandasDataframe, allow_copy: bool = True) -> None:
self._col = column
self._allow_copy = allow_copy

@property
def size(self) -> int:
return len(self._col.index)

@@ -187,18 +187,21 @@ def _dtype_from_primitive_pandas_dtype(
)

@property
def describe_categorical(self) -> Dict[str, Any]:
if not self.dtype[0] == DTypeKind.CATEGORICAL:
raise RuntimeError(
def describe_categorical(self) -> CategoricalDescription:
if self.dtype[0] != DTypeKind.CATEGORICAL:
raise TypeError(
"`describe_categorical only works on a column with "
+ "categorical dtype!"
)

pandas_series = self._col.to_pandas().squeeze(axis=1)
cat_frame = self._col.from_pandas(
pandas.DataFrame({"cat": pandas_series.cat.categories})
)
return {
"is_ordered": pandas_series.cat.ordered,
"is_dictionary": True,
"mapping": dict(zip(pandas_series.cat.codes, pandas_series.cat.categories)),
"categories": PandasProtocolColumn(cat_frame, self._allow_copy),
}

@property
@@ -262,7 +265,7 @@ def get_chunks(
self, n_chunks: Optional[int] = None
) -> Iterable["PandasProtocolColumn"]:
cur_n_chunks = self.num_chunks()
n_rows = self.size
n_rows = self.size()
if n_chunks is None or n_chunks == cur_n_chunks:
cum_row_lengths = np.cumsum([0] + self._col._row_lengths)
for i in range(len(cum_row_lengths) - 1):
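The final hunk switches `get_chunks` to the method form of `size`; a consumer-side sketch of the same call pattern (`col` is any hypothetical column implementing the protocol, such as `PandasProtocolColumn`):

    def chunk_row_counts(col, n_chunks=None):
        # Each chunk is itself a protocol column; size() may trigger computation.
        return [chunk.size() for chunk in col.get_chunks(n_chunks)]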