Commit 2f8b7bf

Support struct-like inputs
1 parent 61afc05 commit 2f8b7bf

7 files changed: +218 −15 lines changed

python/cuda_parallel/cuda/parallel/experimental/_cccl.py

Lines changed: 12 additions & 3 deletions
@@ -10,8 +10,9 @@
 import numpy as np
 from numba import cuda, types

-from ._utils.cai import DeviceArrayLike, get_dtype, is_contiguous
+from ._utils.cai import get_dtype, is_contiguous
 from .iterators._iterators import IteratorBase
+from .typing import DeviceArrayLike, GpuStruct


 # MUST match `cccl_type_enum` in c/include/cccl/c/types.h
@@ -121,6 +122,10 @@ def _type_to_enum(numba_type: types.Type) -> TypeEnum:
 def _numba_type_to_info(numba_type: types.Type) -> TypeInfo:
     context = cuda.descriptor.cuda_target.target_context
     value_type = context.get_value_type(numba_type)
+    if isinstance(numba_type, types.Record):
+        # then `value_type` is a pointer and we need the
+        # alignment of the pointee.
+        value_type = value_type.pointee
     size = value_type.get_abi_size(context.target_data)
     alignment = value_type.get_abi_alignment(context.target_data)
     return TypeInfo(size, alignment, _type_to_enum(numba_type))
@@ -209,6 +214,10 @@ def to_cccl_iter(array_or_iterator) -> Iterator:
     return _device_array_to_cccl_iter(array_or_iterator)


-def host_array_to_value(array: np.ndarray) -> Value:
+def host_array_to_value(array: np.ndarray | GpuStruct) -> Value:
     info = _numpy_type_to_info(array.dtype)
-    return Value(info, array.ctypes.data)
+    if isinstance(array, np.ndarray):
+        data = ctypes.cast(array.ctypes.data, ctypes.c_void_p)
+    else:
+        data = ctypes.cast(ctypes.pointer(array._data), ctypes.c_void_p)
+    return Value(info, data)
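
For context, the struct branch of `host_array_to_value` works because a `@gpu_struct` instance keeps a ctypes structure in its `._data` attribute, so taking that structure's address yields a host pointer to the packed initial value. A minimal standalone sketch of that pointer round-trip, using ctypes only (the `Point` name and field layout here are illustrative, not from this commit):

import ctypes

class Point(ctypes.Structure):
    # hypothetical two-field struct standing in for a GpuStruct's ._data
    _fields_ = [("x", ctypes.c_int32), ("y", ctypes.c_int32)]

data = Point(1, 2)
# same cast as the non-ndarray branch above:
ptr = ctypes.cast(ctypes.pointer(data), ctypes.c_void_p)
assert ptr.value is not None  # host address of the 8-byte struct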

python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py

Lines changed: 8 additions & 1 deletion
@@ -15,7 +15,14 @@


 def get_dtype(arr: DeviceArrayLike) -> np.dtype:
-    return np.dtype(arr.__cuda_array_interface__["typestr"])
+    typestr = arr.__cuda_array_interface__["typestr"]
+
+    if typestr.startswith("|V"):
+        # it's a structured dtype, use the descr field:
+        return np.dtype(arr.__cuda_array_interface__["descr"])
+    else:
+        # a simple dtype, use the typestr field:
+        return np.dtype(typestr)


 def get_strides(arr: DeviceArrayLike) -> Optional[Tuple]:
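
The `"|V"` check above keys off how the array interface describes structured dtypes: `typestr` degrades to an opaque void type (e.g. `"|V12"` for a 12-byte record), while `descr` preserves the per-field layout. A small NumPy-only illustration using the host-side `__array_interface__`, which mirrors the CUDA one:

import numpy as np

dt = np.dtype([("r", np.int32), ("g", np.int32), ("b", np.int32)])
arr = np.zeros(4, dt)
print(arr.__array_interface__["typestr"])  # |V12 -- opaque, field info lost
print(arr.__array_interface__["descr"])    # [('r', '<i4'), ('g', '<i4'), ('b', '<i4')]
assert np.dtype(arr.__array_interface__["descr"]) == dt  # descr round-trips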

python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py

Lines changed: 13 additions & 11 deletions
@@ -18,15 +18,16 @@
 from .._caching import CachableFunction, cache_with_key
 from .._utils import cai
 from ..iterators._iterators import IteratorBase
-from ..typing import DeviceArrayLike
+from ..typing import DeviceArrayLike, GpuStruct


 class _Op:
-    def __init__(self, dtype: np.dtype, op: Callable):
-        value_type = numba.from_dtype(dtype)
-        self.ltoir, _ = cuda.compile(
-            op, sig=value_type(value_type, value_type), output="ltoir"
-        )
+    def __init__(self, h_init: np.ndarray | GpuStruct, op: Callable):
+        if isinstance(h_init, np.ndarray):
+            value_type = numba.from_dtype(h_init.dtype)
+        else:
+            value_type = numba.typeof(h_init)
+        self.ltoir, _ = cuda.compile(op, sig=(value_type, value_type), output="ltoir")
         self.name = op.__name__.encode("utf-8")

     def handle(self) -> cccl.Op:
@@ -53,7 +54,7 @@ def __init__(
         d_in: DeviceArrayLike | IteratorBase,
         d_out: DeviceArrayLike,
         op: Callable,
-        h_init: np.ndarray,
+        h_init: np.ndarray | GpuStruct,
     ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name(
@@ -64,11 +65,10 @@ def __init__(
         cc_major, cc_minor = cuda.get_current_device().compute_capability
         cub_path, thrust_path, libcudacxx_path, cuda_include_path = get_paths()
         bindings = get_bindings()
-        self.op_wrapper = _Op(h_init.dtype, op)
+        self.op_wrapper = _Op(h_init, op)
         d_out_cccl = cccl.to_cccl_iter(d_out)
         self.build_result = cccl.DeviceReduceBuildResult()

-        # TODO Figure out caching
         error = bindings.cccl_device_reduce_build(
             ctypes.byref(self.build_result),
             d_in_cccl,
@@ -85,7 +85,9 @@ def __init__(
         if error != enums.CUDA_SUCCESS:
             raise ValueError("Error building reduce")

-    def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray):
+    def __call__(
+        self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray | GpuStruct
+    ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         if d_in_cccl.type.value == cccl.IteratorKind.ITERATOR:
             assert num_items is not None
@@ -99,7 +101,7 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray
             self._ctor_d_in_cccl_type_enum_name,
             cccl.type_enum_as_name(d_in_cccl.value_type.type.value),
         )
-        _dtype_validation(self._ctor_d_out_dtype, d_out.dtype)
+        _dtype_validation(self._ctor_d_out_dtype, cai.get_dtype(d_out))
         _dtype_validation(self._ctor_init_dtype, h_init.dtype)
         bindings = get_bindings()
         if temp_storage is None:
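
One note on the `_Op` change: in Numba, passing `sig` as a bare tuple of argument types leaves the return type to type inference rather than declaring it up front, which accommodates an operator over a custom struct type that returns one of its operands. A hedged sketch of the two typing paths (requires numba; `Pixel` stands for any `@gpu_struct`-decorated class):

import numba
import numpy as np

h_init = np.zeros(1, dtype=np.int32)
value_type = numba.from_dtype(h_init.dtype)  # ndarray path -> int32
# GpuStruct path (assuming Pixel was decorated with @gpu_struct):
#   value_type = numba.typeof(Pixel(0, 0, 0))
# Either way, the op compiles with an argument-only signature:
#   ltoir, _ = cuda.compile(op, sig=(value_type, value_type), output="ltoir")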
python/cuda_parallel/cuda/parallel/experimental/gpu_struct.py (new file)

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+from dataclasses import dataclass
+from dataclasses import fields as dataclass_fields
+
+import numba
+import numpy as np
+from numba.core import cgutils
+from numba.core.extending import (
+    make_attribute_wrapper,
+    models,
+    register_model,
+    typeof_impl,
+)
+from numba.core.typing import signature as nb_signature
+from numba.core.typing.templates import AttributeTemplate, ConcreteTemplate
+from numba.cuda.cudadecl import registry as cuda_registry
+from numba.extending import lower_builtin
+
+from .typing import GpuStruct
+
+
+def gpu_struct(this: type) -> GpuStruct:
+    """
+    Define the decorated class as being a GpuStruct.
+
+    A GpuStruct is a class that defines fields, similar to a dataclass.
+    Arrays of GpuStruct objects can be used as inputs to algorithms.
+    The type of each field must be a subclass of `np.number`, e.g.,
+    `np.int32` or `np.float64`.
+
+    Example:
+        The code snippet below shows how to use `gpu_struct` to define
+        a `Pixel` type (composed of `r`, `g` and `b` values), and perform
+        a reduction on an array of `Pixel` objects to identify the one
+        with the largest `g` component:
+
+        .. literalinclude:: ../../python/cuda_parallel/tests/test_reduce_api.py
+            :language: python
+            :dedent:
+            :start-after: example-begin reduce-struct
+            :end-before: example-end reduce-struct
+    """
+
+    anns = getattr(this, "__annotations__", {})
+
+    # set the .dtype attribute on the class for numpy compatibility:
+    setattr(this, "dtype", np.dtype(list(anns.items())))
+
+    # define __post_init__ to create a ctypes object from the fields,
+    # and keep a reference to it in the `._data` attribute.
+    def __post_init__(self):
+        ctypes_typ = np.ctypeslib.as_ctypes_type(this.dtype)
+        self._data = ctypes_typ(*(getattr(self, name) for name in this.dtype.names))
+
+    setattr(this, "__post_init__", __post_init__)
+
+    # create a dataclass:
+    this = dataclass(this)
+    fields = dataclass_fields(this)
+
+    # define a numba type corresponding to the dataclass:
+    class ThisType(numba.types.Type):
+        def __init__(self):
+            super().__init__(name=this.__name__)
+
+    this_type = ThisType()
+
+    @typeof_impl.register(this)
+    def typeof_this(val, c):
+        return ThisType()
+
+    # Data model corresponding to ThisType:
+    @register_model(ThisType)
+    class ThisModel(models.StructModel):
+        def __init__(self, dmm, fe_type):
+            members = [(field.name, numba.from_dtype(field.type)) for field in fields]
+            super().__init__(dmm, fe_type, members)
+
+    # Typing for accessing attributes (fields) of the dataclass:
+    class ThisAttrsTemplate(AttributeTemplate):
+        pass
+
+    for field in fields:
+        typ = field.type
+        name = field.name
+
+        def resolver(self, this):
+            return numba.from_dtype(typ)
+
+        setattr(ThisAttrsTemplate, f"resolve_{name}", resolver)
+
+    @cuda_registry.register_attr
+    class ThisAttrs(ThisAttrsTemplate):
+        key = this_type
+
+    # Lowering for attribute access:
+    for field in fields:
+        make_attribute_wrapper(ThisType, field.name, field.name)
+
+    # Register typing for constructor.
+    @cuda_registry.register
+    class TypeConstructor(ConcreteTemplate):
+        key = this
+        cases = [
+            nb_signature(this_type, *[numba.from_dtype(field.type) for field in fields])
+        ]
+
+    cuda_registry.register_global(this, numba.types.Function(TypeConstructor))
+
+    def type_constructor(context, builder, sig, args):
+        ty = sig.return_type
+        retval = cgutils.create_struct_proxy(ty)(context, builder)
+        for field, val in zip(fields, args):
+            setattr(retval, field.name, val)
+        return retval._getvalue()
+
+    lower_builtin(this, *[numba.from_dtype(field.type) for field in fields])(
+        type_constructor
+    )
+
+    return this
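
To make the decorator's host-side bookkeeping concrete, here is what the `.dtype` and `._data` machinery above computes for a three-field struct, reproduced standalone with NumPy and ctypes only (the field names are illustrative):

import ctypes
import numpy as np

anns = {"r": np.int32, "g": np.int32, "b": np.int32}  # the class annotations
dtype = np.dtype(list(anns.items()))
print(dtype)  # dtype([('r', '<i4'), ('g', '<i4'), ('b', '<i4')])

ctypes_typ = np.ctypeslib.as_ctypes_type(dtype)  # a ctypes.Structure subclass
data = ctypes_typ(0, 255, 0)                     # what __post_init__ stores in ._data
ptr = ctypes.cast(ctypes.pointer(data), ctypes.c_void_p)  # address handed to the C layer
print(data.g)  # 255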

python/cuda_parallel/cuda/parallel/experimental/typing.py

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,5 @@
+from typing import Any
+
 from typing_extensions import (
     Protocol,
 )  # TODO: typing_extensions required for Python 3.7 docs env
@@ -10,3 +12,7 @@ class DeviceArrayLike(Protocol):
     """

     __cuda_array_interface__: dict
+
+
+# return type of @gpu_struct
+GpuStruct = Any

python/cuda_parallel/tests/test_reduce.py

Lines changed: 28 additions & 0 deletions
@@ -12,6 +12,7 @@

 import cuda.parallel.experimental.algorithms as algorithms
 import cuda.parallel.experimental.iterators as iterators
+from cuda.parallel.experimental.gpu_struct import gpu_struct


 def random_int(shape, dtype):
@@ -550,3 +551,30 @@ def binary_op(x, y):
     d_in = cp.zeros(size)[::2]
     with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."):
         _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init)
+
+
+def test_reduce_struct_type():
+    @gpu_struct
+    class Pixel:
+        r: np.int32
+        g: np.int32
+        b: np.int32
+
+    def max_g_value(x, y):
+        return x if x.g > y.g else y
+
+    d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype)
+    d_out = cp.zeros(1, Pixel.dtype)
+
+    h_init = Pixel(0, 0, 0)
+
+    reduce_into = algorithms.reduce_into(d_rgb, d_out, max_g_value, h_init)
+    temp_storage_bytes = reduce_into(None, d_rgb, d_out, len(d_rgb), h_init)
+
+    d_temp_storage = cp.zeros(temp_storage_bytes, dtype=np.uint8)
+    _ = reduce_into(d_temp_storage, d_rgb, d_out, len(d_rgb), h_init)
+
+    h_rgb = d_rgb.get()
+    expected = h_rgb[h_rgb.view("int32")[:, 1].argmax()]
+
+    np.testing.assert_equal(expected["g"], d_out.get()["g"])
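
A note on the `.view(Pixel.dtype)` trick used in the test: a (10, 3) int32 array reinterpreted with a three-field int32 record dtype becomes a (10, 1) array of records, so each RGB row maps onto one Pixel. A NumPy-only illustration:

import numpy as np

dtype = np.dtype([("r", np.int32), ("g", np.int32), ("b", np.int32)])
rgb = np.random.randint(0, 256, (10, 3)).astype(np.int32)
pixels = rgb.view(dtype)
print(pixels.shape)             # (10, 1) -- one 12-byte record per row
g = pixels.view("int32")[:, 1]  # viewing back recovers the (10, 3) layout; column 1 is g
assert (g == rgb[:, 1]).all()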

python/cuda_parallel/tests/test_reduce_api.py

Lines changed: 31 additions & 0 deletions
@@ -37,3 +37,34 @@ def min_op(a, b):
     expected_output = 0
     assert (d_output == expected_output).all()
     # example-end reduce-min
+
+
+def test_reduce_struct_type():
+    from cuda.parallel.experimental.gpu_struct import gpu_struct
+
+    # example-begin reduce-struct
+    @gpu_struct
+    class Pixel:
+        r: np.int32
+        g: np.int32
+        b: np.int32
+
+    def max_g_value(x, y):
+        return x if x.g > y.g else y
+
+    d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(Pixel.dtype)
+    d_out = cp.zeros(1, Pixel.dtype)
+
+    h_init = Pixel(0, 0, 0)
+
+    reduce_into = algorithms.reduce_into(d_rgb, d_out, max_g_value, h_init)
+    temp_storage_bytes = reduce_into(None, d_rgb, d_out, len(d_rgb), h_init)
+
+    d_temp_storage = cp.zeros(temp_storage_bytes, dtype=np.uint8)
+    _ = reduce_into(d_temp_storage, d_rgb, d_out, len(d_rgb), h_init)
+
+    h_rgb = d_rgb.get()
+    expected = h_rgb[h_rgb.view("int32")[:, 1].argmax()]
+
+    np.testing.assert_equal(expected["g"], d_out.get()["g"])
+    # example-end reduce-struct
