Skip to content

Commit

Permalink
use explicit mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
brandon-b-miller committed May 22, 2024
1 parent 3a478be commit b3e934c
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 75 deletions.
25 changes: 21 additions & 4 deletions python/cudf/cudf/_lib/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ import functools

import numpy as np

from .types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES


cdef class Column:
"""A container of nullable device data as a column of elements.
Expand Down Expand Up @@ -353,8 +351,27 @@ cdef class ListColumnView:

@functools.cache
def _datatype_from_dtype_desc(desc):
    """Translate a dtype code string into a pylibcudf ``DataType``.

    Parameters
    ----------
    desc : str
        A type code such as ``'i4'``, ``'f8'``, ``'b1'``, ``'M8[ns]'`` or
        ``'m8[us]'``. It is matched verbatim against the keys of the table
        below; no normalisation is applied.

    Returns
    -------
    DataType
        The ``DataType`` built from the matching ``type_id``.

    Raises
    ------
    ValueError
        If ``desc`` is not one of the supported codes.

    Notes
    -----
    Results are memoised with ``functools.cache``, so ``desc`` must be
    hashable and the table is only rebuilt on a cache miss.
    """
    # Keys are plain strings on purpose: ``desc`` must NOT be wrapped in
    # ``np.dtype`` before the lookup, otherwise it would no longer be found
    # among these ``str`` keys and every call would be rejected.
    mapping = {
        'u1': type_id.UINT8,
        'u2': type_id.UINT16,
        'u4': type_id.UINT32,
        'u8': type_id.UINT64,
        'i1': type_id.INT8,
        'i2': type_id.INT16,
        'i4': type_id.INT32,
        'i8': type_id.INT64,
        'f4': type_id.FLOAT32,
        'f8': type_id.FLOAT64,
        'b1': type_id.BOOL8,
        'M8[s]': type_id.TIMESTAMP_SECONDS,
        'M8[ms]': type_id.TIMESTAMP_MILLISECONDS,
        'M8[us]': type_id.TIMESTAMP_MICROSECONDS,
        'M8[ns]': type_id.TIMESTAMP_NANOSECONDS,
        'm8[s]': type_id.DURATION_SECONDS,
        'm8[ms]': type_id.DURATION_MILLISECONDS,
        'm8[us]': type_id.DURATION_MICROSECONDS,
        'm8[ns]': type_id.DURATION_NANOSECONDS,
    }
    if desc not in mapping:
        raise ValueError(f"Unsupported dtype: {desc}")
    return DataType(mapping[desc])
Expand Down
42 changes: 28 additions & 14 deletions python/cudf/cudf/_lib/pylibcudf/interop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,37 @@ from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport (
scale_type,
)

import numpy as np

from .types import (
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
)

from .column cimport Column
from .scalar cimport Scalar
from .table cimport Table
from .types cimport DataType, type_id

# Explicit pyarrow type -> libcudf ``type_id`` table. Only the exact pyarrow
# type objects listed here are recognised as keys.
ARROW_TO_PYLIBCUDF_TYPES = {
    # signed integers
    pa.int8(): type_id.INT8,
    pa.int16(): type_id.INT16,
    pa.int32(): type_id.INT32,
    pa.int64(): type_id.INT64,
    # unsigned integers
    pa.uint8(): type_id.UINT8,
    pa.uint16(): type_id.UINT16,
    pa.uint32(): type_id.UINT32,
    pa.uint64(): type_id.UINT64,
    # floating point, boolean, string
    pa.float32(): type_id.FLOAT32,
    pa.float64(): type_id.FLOAT64,
    pa.bool_(): type_id.BOOL8,
    pa.string(): type_id.STRING,
    # durations, one entry per supported unit
    pa.duration('s'): type_id.DURATION_SECONDS,
    pa.duration('ms'): type_id.DURATION_MILLISECONDS,
    pa.duration('us'): type_id.DURATION_MICROSECONDS,
    pa.duration('ns'): type_id.DURATION_NANOSECONDS,
    # timestamps (constructed without a tz argument), one entry per unit
    pa.timestamp('s'): type_id.TIMESTAMP_SECONDS,
    pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS,
    pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS,
    pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS,
}

# Reverse table: libcudf ``type_id`` -> pyarrow type. Every value in the
# forward table is distinct, so inverting it drops no entries.
LIBCUDF_TO_ARROW_TYPES = {
    libcudf_id: arrow_type
    for arrow_type, libcudf_id in ARROW_TO_PYLIBCUDF_TYPES.items()
}

cdef column_metadata _metadata_to_libcudf(metadata):
"""Convert a ColumnMetadata object to C++ column_metadata.
Expand Down Expand Up @@ -93,10 +112,7 @@ def _from_arrow_datatype(pyarrow_object):
elif isinstance(pyarrow_object, pa.ListType):
return DataType(type_id.LIST)
else:
return DataType(
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.get(
np.dtype(pyarrow_object.to_pandas_dtype()))
)
return DataType(ARROW_TO_PYLIBCUDF_TYPES.get(pyarrow_object))


@from_arrow.register(pa.Table)
Expand Down Expand Up @@ -201,9 +217,7 @@ def _to_arrow_datatype(cudf_object):
raise ValueError(
f"Cannot convert {cudf_object} to PyArrow type"
)
# Look the libcudf type id up in the reverse table. ARROW_TO_PYLIBCUDF_TYPES is
# keyed by pyarrow types, so querying it with a type_id would always give None.
return LIBCUDF_TO_ARROW_TYPES.get(cudf_object.id())


@to_arrow.register(Table)
Expand Down
53 changes: 0 additions & 53 deletions python/cudf/cudf/_lib/pylibcudf/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cyth
from cudf._lib.pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip
from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip

import numpy as np


cdef class DataType:
"""Indicator for the logical data type of an element in a column.
Expand Down Expand Up @@ -66,54 +64,3 @@ cdef class DataType:
cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY)
ret.c_obj = dt
return ret

# NumPy dtype -> libcudf ``TypeId`` lookup table.
# Note that the ``object`` dtype is mapped to ``TypeId.STRING``.
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = {
    np.dtype("int8"): TypeId.INT8,
    np.dtype("int16"): TypeId.INT16,
    np.dtype("int32"): TypeId.INT32,
    np.dtype("int64"): TypeId.INT64,
    np.dtype("uint8"): TypeId.UINT8,
    np.dtype("uint16"): TypeId.UINT16,
    np.dtype("uint32"): TypeId.UINT32,
    np.dtype("uint64"): TypeId.UINT64,
    np.dtype("float32"): TypeId.FLOAT32,
    np.dtype("float64"): TypeId.FLOAT64,
    np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS,
    np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS,
    np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS,
    np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS,
    np.dtype("object"): TypeId.STRING,
    np.dtype("bool"): TypeId.BOOL8,
    np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS,
    np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS,
    np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS,
    np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS,
}

# libcudf ``TypeId`` -> NumPy dtype lookup table.
# This is not a one-to-one inverse of SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
# EMPTY and INT8 both map to ``int8``, and STRING and STRUCT both map to
# ``object``.
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
    # There's no equivalent to EMPTY in cudf. We translate EMPTY
    # columns from libcudf to ``int8`` columns of all nulls in Python.
    # ``int8`` is chosen because it uses the least amount of memory.
    TypeId.EMPTY: np.dtype("int8"),
    TypeId.INT8: np.dtype("int8"),
    TypeId.INT16: np.dtype("int16"),
    TypeId.INT32: np.dtype("int32"),
    TypeId.INT64: np.dtype("int64"),
    TypeId.UINT8: np.dtype("uint8"),
    TypeId.UINT16: np.dtype("uint16"),
    TypeId.UINT32: np.dtype("uint32"),
    TypeId.UINT64: np.dtype("uint64"),
    TypeId.FLOAT32: np.dtype("float32"),
    TypeId.FLOAT64: np.dtype("float64"),
    TypeId.BOOL8: np.dtype("bool"),
    TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"),
    TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"),
    TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"),
    TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"),
    TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"),
    TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"),
    TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"),
    TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"),
    TypeId.STRING: np.dtype("object"),
    TypeId.STRUCT: np.dtype("object"),
}
55 changes: 51 additions & 4 deletions python/cudf/cudf/_lib/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ from cudf._lib.types cimport (

import cudf
from cudf._lib import pylibcudf
from cudf._lib.pylibcudf.types import (
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
)

size_type_dtype = np.dtype("int32")

Expand Down Expand Up @@ -79,11 +75,62 @@ class TypeId(IntEnum):
STRUCT = <underlying_type_t_type_id> libcudf_types.type_id.STRUCT


# NumPy dtype -> libcudf ``TypeId`` lookup table.
SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = {
    # signed integers
    np.dtype("int8"): TypeId.INT8,
    np.dtype("int16"): TypeId.INT16,
    np.dtype("int32"): TypeId.INT32,
    np.dtype("int64"): TypeId.INT64,
    # unsigned integers
    np.dtype("uint8"): TypeId.UINT8,
    np.dtype("uint16"): TypeId.UINT16,
    np.dtype("uint32"): TypeId.UINT32,
    np.dtype("uint64"): TypeId.UINT64,
    # floating point
    np.dtype("float32"): TypeId.FLOAT32,
    np.dtype("float64"): TypeId.FLOAT64,
    # timestamps, one entry per unit
    np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS,
    np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS,
    np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS,
    np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS,
    # ``object`` dtype is mapped to STRING
    np.dtype("object"): TypeId.STRING,
    np.dtype("bool"): TypeId.BOOL8,
    # durations, one entry per unit
    np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS,
    np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS,
    np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS,
    np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS,
}

# Same keys as above, with each value re-expressed as the ``.value`` of the
# matching ``pylibcudf.TypeId`` member.
SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = {
    np_dtype: pylibcudf.TypeId(libcudf_id).value
    for np_dtype, libcudf_id in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items()
}

# libcudf ``TypeId`` -> NumPy dtype lookup table.
# Several type ids share a dtype here: EMPTY and INT8 both map to ``int8``,
# and STRING and STRUCT both map to ``object``.
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
    # There's no equivalent to EMPTY in cudf. We translate EMPTY
    # columns from libcudf to ``int8`` columns of all nulls in Python.
    # ``int8`` is chosen because it uses the least amount of memory.
    TypeId.EMPTY: np.dtype("int8"),
    TypeId.INT8: np.dtype("int8"),
    TypeId.INT16: np.dtype("int16"),
    TypeId.INT32: np.dtype("int32"),
    TypeId.INT64: np.dtype("int64"),
    TypeId.UINT8: np.dtype("uint8"),
    TypeId.UINT16: np.dtype("uint16"),
    TypeId.UINT32: np.dtype("uint32"),
    TypeId.UINT64: np.dtype("uint64"),
    TypeId.FLOAT32: np.dtype("float32"),
    TypeId.FLOAT64: np.dtype("float64"),
    TypeId.BOOL8: np.dtype("bool"),
    TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"),
    TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"),
    TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"),
    TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"),
    TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"),
    TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"),
    TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"),
    TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"),
    TypeId.STRING: np.dtype("object"),
    TypeId.STRUCT: np.dtype("object"),
}

PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
pylibcudf.TypeId(k).value: v
for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items()
Expand Down

0 comments on commit b3e934c

Please sign in to comment.