From b3e934c8c012ac627d3c4f34489b22c13ee475be Mon Sep 17 00:00:00 2001 From: brandon-b-miller Date: Wed, 22 May 2024 13:42:02 -0700 Subject: [PATCH] use explicit mappings --- python/cudf/cudf/_lib/pylibcudf/column.pyx | 25 ++++++++-- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 42 ++++++++++------ python/cudf/cudf/_lib/pylibcudf/types.pyx | 53 -------------------- python/cudf/cudf/_lib/types.pyx | 55 +++++++++++++++++++-- 4 files changed, 100 insertions(+), 75 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index 38f4c2691d2..e726eca154f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -22,8 +22,6 @@ import functools import numpy as np -from .types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - cdef class Column: """A container of nullable device data as a column of elements. @@ -353,8 +351,27 @@ cdef class ListColumnView: @functools.cache def _datatype_from_dtype_desc(desc): - desc = np.dtype(desc) - mapping = SUPPORTED_NUMPY_TO_LIBCUDF_TYPES + mapping = { + 'u1': type_id.UINT8, + 'u2': type_id.UINT16, + 'u4': type_id.UINT32, + 'u8': type_id.UINT64, + 'i1': type_id.INT8, + 'i2': type_id.INT16, + 'i4': type_id.INT32, + 'i8': type_id.INT64, + 'f4': type_id.FLOAT32, + 'f8': type_id.FLOAT64, + 'b1': type_id.BOOL8, + 'M8[s]': type_id.TIMESTAMP_SECONDS, + 'M8[ms]': type_id.TIMESTAMP_MILLISECONDS, + 'M8[us]': type_id.TIMESTAMP_MICROSECONDS, + 'M8[ns]': type_id.TIMESTAMP_NANOSECONDS, + 'm8[s]': type_id.DURATION_SECONDS, + 'm8[ms]': type_id.DURATION_MILLISECONDS, + 'm8[us]': type_id.DURATION_MICROSECONDS, + 'm8[ns]': type_id.DURATION_NANOSECONDS, + } if desc not in mapping: raise ValueError(f"Unsupported dtype: {desc}") return DataType(mapping[desc]) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 00d2454e864..befdc4463cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ 
b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -28,18 +28,37 @@ from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport ( scale_type, ) -import numpy as np - -from .types import ( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES, -) - from .column cimport Column from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +ARROW_TO_PYLIBCUDF_TYPES = { + pa.int8(): type_id.INT8, + pa.int16(): type_id.INT16, + pa.int32(): type_id.INT32, + pa.int64(): type_id.INT64, + pa.uint8(): type_id.UINT8, + pa.uint16(): type_id.UINT16, + pa.uint32(): type_id.UINT32, + pa.uint64(): type_id.UINT64, + pa.float32(): type_id.FLOAT32, + pa.float64(): type_id.FLOAT64, + pa.bool_(): type_id.BOOL8, + pa.string(): type_id.STRING, + pa.duration('s'): type_id.DURATION_SECONDS, + pa.duration('ms'): type_id.DURATION_MILLISECONDS, + pa.duration('us'): type_id.DURATION_MICROSECONDS, + pa.duration('ns'): type_id.DURATION_NANOSECONDS, + pa.timestamp('s'): type_id.TIMESTAMP_SECONDS, + pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS, + pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, + pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, +} + +LIBCUDF_TO_ARROW_TYPES = { + v: k for k, v in ARROW_TO_PYLIBCUDF_TYPES.items() +} cdef column_metadata _metadata_to_libcudf(metadata): """Convert a ColumnMetadata object to C++ column_metadata. 
@@ -93,10 +112,7 @@ def _from_arrow_datatype(pyarrow_object): elif isinstance(pyarrow_object, pa.ListType): return DataType(type_id.LIST) else: - return DataType( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.get( - np.dtype(pyarrow_object.to_pandas_dtype())) - ) + return DataType(ARROW_TO_PYLIBCUDF_TYPES.get(pyarrow_object)) @from_arrow.register(pa.Table) @@ -201,9 +217,7 @@ def _to_arrow_datatype(cudf_object): raise ValueError( f"Cannot convert {cudf_object} to PyArrow type" ) - return pa.from_numpy_dtype( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.get(cudf_object) - ) + return LIBCUDF_TO_ARROW_TYPES.get(cudf_object.id()) @to_arrow.register(Table) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 42c8324999e..841a600df06 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -15,8 +15,6 @@ from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cyth from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip -import numpy as np - cdef class DataType: """Indicator for the logical data type of an element in a column. 
@@ -66,54 +64,3 @@ cdef class DataType: cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret - -SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = { - np.dtype("int8"): TypeId.INT8, - np.dtype("int16"): TypeId.INT16, - np.dtype("int32"): TypeId.INT32, - np.dtype("int64"): TypeId.INT64, - np.dtype("uint8"): TypeId.UINT8, - np.dtype("uint16"): TypeId.UINT16, - np.dtype("uint32"): TypeId.UINT32, - np.dtype("uint64"): TypeId.UINT64, - np.dtype("float32"): TypeId.FLOAT32, - np.dtype("float64"): TypeId.FLOAT64, - np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): TypeId.STRING, - np.dtype("bool"): TypeId.BOOL8, - np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, -} - -LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - # There's no equivalent to EMPTY in cudf. We translate EMPTY - # columns from libcudf to ``int8`` columns of all nulls in Python. - # ``int8`` is chosen because it uses the least amount of memory. 
- TypeId.EMPTY: np.dtype("int8"), - TypeId.INT8: np.dtype("int8"), - TypeId.INT16: np.dtype("int16"), - TypeId.INT32: np.dtype("int32"), - TypeId.INT64: np.dtype("int64"), - TypeId.UINT8: np.dtype("uint8"), - TypeId.UINT16: np.dtype("uint16"), - TypeId.UINT32: np.dtype("uint32"), - TypeId.UINT64: np.dtype("uint64"), - TypeId.FLOAT32: np.dtype("float32"), - TypeId.FLOAT64: np.dtype("float64"), - TypeId.BOOL8: np.dtype("bool"), - TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"), - TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"), - TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"), - TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), - TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), - TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), - TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), - TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), - TypeId.STRING: np.dtype("object"), - TypeId.STRUCT: np.dtype("object"), -} diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 48ff0170bc3..895e1afc502 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -20,10 +20,6 @@ from cudf._lib.types cimport ( import cudf from cudf._lib import pylibcudf -from cudf._lib.pylibcudf.types import ( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES, -) size_type_dtype = np.dtype("int32") @@ -79,11 +75,62 @@ class TypeId(IntEnum): STRUCT = libcudf_types.type_id.STRUCT +SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = { + np.dtype("int8"): TypeId.INT8, + np.dtype("int16"): TypeId.INT16, + np.dtype("int32"): TypeId.INT32, + np.dtype("int64"): TypeId.INT64, + np.dtype("uint8"): TypeId.UINT8, + np.dtype("uint16"): TypeId.UINT16, + np.dtype("uint32"): TypeId.UINT32, + np.dtype("uint64"): TypeId.UINT64, + np.dtype("float32"): TypeId.FLOAT32, + np.dtype("float64"): TypeId.FLOAT64, + np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): 
TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): TypeId.STRING, + np.dtype("bool"): TypeId.BOOL8, + np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, +} + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { k: pylibcudf.TypeId(v).value for k, v in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items() } +LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + # There's no equivalent to EMPTY in cudf. We translate EMPTY + # columns from libcudf to ``int8`` columns of all nulls in Python. + # ``int8`` is chosen because it uses the least amount of memory. + TypeId.EMPTY: np.dtype("int8"), + TypeId.INT8: np.dtype("int8"), + TypeId.INT16: np.dtype("int16"), + TypeId.INT32: np.dtype("int32"), + TypeId.INT64: np.dtype("int64"), + TypeId.UINT8: np.dtype("uint8"), + TypeId.UINT16: np.dtype("uint16"), + TypeId.UINT32: np.dtype("uint32"), + TypeId.UINT64: np.dtype("uint64"), + TypeId.FLOAT32: np.dtype("float32"), + TypeId.FLOAT64: np.dtype("float64"), + TypeId.BOOL8: np.dtype("bool"), + TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"), + TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"), + TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"), + TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), + TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), + TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), + TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), + TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), + TypeId.STRING: np.dtype("object"), + TypeId.STRUCT: np.dtype("object"), +} + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { pylibcudf.TypeId(k).value: v for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items()