Add docs, tests, and samples for StridedMemoryView/@args_viewable_as_strided_memory #247

Merged
merged 19 commits on Dec 4, 2024
Changes from 4 commits
1 change: 1 addition & 0 deletions cuda_core/cuda/core/experimental/__init__.py
@@ -7,3 +7,4 @@
from cuda.core.experimental._launcher import LaunchConfig, launch
from cuda.core.experimental._program import Program
from cuda.core.experimental._stream import Stream, StreamOptions
from cuda.core.experimental import utils
83 changes: 81 additions & 2 deletions cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -20,13 +20,65 @@ from cuda.core.experimental._utils import handle_return

@cython.dataclasses.dataclass
cdef class StridedMemoryView:
    """A dataclass holding metadata of a strided dense array/tensor.

    A :obj:`StridedMemoryView` instance can be created in two ways:

    1. Using the :obj:`viewable` decorator (recommended)
    2. Explicit construction, see below

    This object supports both DLPack (up to v1.0) and CUDA Array Interface
    (CAI) v3. When wrapping an arbitrary object it will try the DLPack
    protocol first, then the CAI protocol. A :obj:`BufferError` is raised if
    neither is supported.

    Both protocols require a consumer stream. For DLPack, the stream is
    passed to ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below);
    for CAI, stream order is established between the consumer stream and the
    producer stream (from ``obj.__cuda_array_interface__["stream"]``), as if
    ``cudaStreamWaitEvent`` were called by this method.

    To opt out of the stream ordering operation in either DLPack or CAI,
    pass ``stream_ptr=-1``. Note that this deviates (on purpose) from the
    semantics of ``obj.__dlpack__(stream=None, ...)``: ``cuda.core`` does not
    encourage using the (legacy) default/null stream, but this choice is
    consistent with the CAI semantics. For DLPack, ``stream=-1`` is passed
    to ``obj.__dlpack__()`` internally instead.
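
    A minimal sketch of explicit construction (assuming a current ``Device``
    named ``dev`` and CuPy imported as ``cp``):

    .. code-block:: python

        s = dev.create_stream()  # the consumer stream
        a = cp.arange(10)        # any DLPack- or CAI-capable object
        view = StridedMemoryView(a, stream_ptr=s.handle)  # stream-ordered
        view = StridedMemoryView(a, stream_ptr=-1)        # opt out of ordering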

    Attributes
    ----------
    ptr : int
        Pointer to the tensor buffer (as a Python `int`).
    shape : tuple
        Shape of the tensor.
    strides : tuple
        Strides of the tensor (in **counts**, not bytes).
    dtype : numpy.dtype
        Data type of the tensor.
    device_id : int
        The device ID on which the tensor resides; 0 for CPU tensors.
    device_accessible : bool
        Whether the tensor data can be accessed on the GPU.
    readonly : bool
        Whether the tensor data is read-only (cannot be modified in place).
    exporting_obj : Any
        A reference to the original tensor object that is being viewed.

    Parameters
    ----------
    obj : Any
        Any object that supports either DLPack (up to v1.0) or CUDA Array
        Interface (v3).
    stream_ptr : int
        The pointer address (as a Python `int`) of the **consumer** stream.
        Stream ordering will be properly established unless ``-1`` is passed.
    """
    # TODO: switch to use Cython's cdef typing?
    ptr: int = None
    shape: tuple = None
    strides: tuple = None  # in counts, not bytes
    dtype: numpy.dtype = None
    device_id: int = None  # 0 for CPU
    device_accessible: bool = None
    readonly: bool = None
    exporting_obj: Any = None
@@ -285,6 +337,33 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):


def viewable(tuple arg_indices):
    """Decorator to create proxy objects to :obj:`StridedMemoryView` for the
    specified positional arguments.

    Inside the decorated function, the specified arguments become instances
    of an (undocumented) proxy type, regardless of their original sources. A
    :obj:`StridedMemoryView` instance can be obtained by passing the (consumer)
    stream pointer (as a Python `int`) to a proxy's ``view()`` method. For
    example:

    .. code-block:: python

        @viewable((1,))
        def my_func(arg0, arg1, arg2, stream: Stream):
            # arg1 can be any object supporting DLPack or CUDA Array Interface
            view = arg1.view(stream.handle)
            assert isinstance(view, StridedMemoryView)
            ...

    This allows array/tensor attributes to be accessed inside the function
    implementation, while keeping the function body array-library-agnostic (if
    desired).

    Parameters
    ----------
    arg_indices : tuple
        The indices of the target positional arguments.
    """
    def wrapped_func_with_indices(func):
        @functools.wraps(func)
        def wrapped_func(*args, **kwargs):
15 changes: 15 additions & 0 deletions cuda_core/docs/source/api.rst
@@ -31,3 +31,18 @@ CUDA compilation toolchain
   :toctree: generated/

   Program


.. module:: cuda.core.experimental.utils

Utility functions
-----------------

.. autosummary::
   :toctree: generated/

   viewable

   :template: dataclass.rst

   StridedMemoryView
9 changes: 9 additions & 0 deletions cuda_core/docs/source/conf.py
@@ -33,6 +33,7 @@
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.napoleon',
    'sphinx.ext.intersphinx',
    'myst_nb',
    'enum_tools.autoenum',
    'sphinx_copybutton',
@@ -81,3 +82,11 @@

# skip cmdline prompts
copybutton_exclude = '.linenos, .gp'

intersphinx_mapping = {
    'python': ('https://docs.python.org/3/', None),
    'numpy': ('https://numpy.org/doc/stable/', None),
}

napoleon_google_docstring = False
napoleon_numpy_docstring = True
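
With intersphinx and napoleon enabled, type names in the NumPy-style docstring fields above resolve to external documentation. An illustrative fragment (the actual docstrings live in ``_memoryview.pyx``), where ``numpy.dtype`` becomes a link into NumPy's reference docs:

dtype : numpy.dtype
    Data type of the tensor.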
137 changes: 137 additions & 0 deletions cuda_core/tests/test_utils.py
@@ -0,0 +1,137 @@
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

try:
    import cupy as cp
except ImportError:
    cp = None
try:
    from numba import cuda as numba_cuda
except ImportError:
    numba_cuda = None
import numpy as np
import pytest

from cuda.core.experimental import Device
from cuda.core.experimental.utils import StridedMemoryView, viewable


def convert_strides_to_counts(strides, itemsize):
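    # e.g. a (6, 6) float64 array sliced [::2, ::2] has byte strides (96, 16);
    # dividing by the 8-byte itemsize yields element-count strides (12, 2)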
    return tuple(s // itemsize for s in strides)


@pytest.mark.parametrize(
    "in_arr", (
        np.empty(3, dtype=np.int32),
        np.empty((6, 6), dtype=np.float64)[::2, ::2],
        np.empty((3, 4), order='F'),
    )
)
class TestViewCPU:

    def test_viewable_cpu(self, in_arr):

        @viewable((0,))
        def my_func(arr):
            # stream_ptr=-1 means "the consumer does not care"
            view = arr.view(-1)
            self._check_view(view, in_arr)

        my_func(in_arr)

    def test_strided_memory_view_cpu(self, in_arr):
        # stream_ptr=-1 means "the consumer does not care"
        view = StridedMemoryView(in_arr, stream_ptr=-1)
        self._check_view(view, in_arr)

    def _check_view(self, view, in_arr):
        assert isinstance(view, StridedMemoryView)
        assert view.ptr == in_arr.ctypes.data
        assert view.shape == in_arr.shape
        strides_in_counts = convert_strides_to_counts(
            in_arr.strides, in_arr.dtype.itemsize)
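        # DLPack may export a C-contiguous array with strides=None
        # (meaning compact, row-major), hence the special case below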
        if in_arr.flags.c_contiguous:
            assert view.strides is None
        else:
            assert view.strides == strides_in_counts
        assert view.dtype == in_arr.dtype
        assert view.device_id == 0
        assert not view.device_accessible
        assert view.exporting_obj is in_arr


def gpu_array_samples():
    # TODO: this function would initialize the device at test collection time
    samples = []
    if cp is not None:
        samples += [
            (cp.empty(3, dtype=cp.complex64), None),
            (cp.empty((6, 6), dtype=cp.float64)[::2, ::2], True),
            (cp.empty((3, 4), order='F'), True),
        ]
    # Numba's device_array is the only known array container that does not
    # support DLPack (so that we get to test the CAI coverage).
    if numba_cuda is not None:
        samples += [
            (numba_cuda.device_array((2,), dtype=np.int8), None),
            (numba_cuda.device_array((4, 2), dtype=np.float32), True),
        ]
    return samples


def gpu_array_ptr(arr):
    if cp is not None and isinstance(arr, cp.ndarray):
        return arr.data.ptr
    if numba_cuda is not None and isinstance(arr, numba_cuda.cudadrv.devicearray.DeviceNDArray):
        return arr.device_ctypes_pointer.value
    assert False, f"{arr=}"


@pytest.mark.parametrize(
    "in_arr,stream", (
        *gpu_array_samples(),
    )
)
class TestViewGPU:

    def test_viewable_gpu(self, in_arr, stream):
        # TODO: use the device fixture?
        dev = Device()
        dev.set_current()
        # This is the consumer stream
        s = dev.create_stream() if stream else None

        @viewable((0,))
        def my_func(arr):
            view = arr.view(s.handle if s else -1)
            self._check_view(view, in_arr, dev)

        my_func(in_arr)

    def test_strided_memory_view_gpu(self, in_arr, stream):
        # TODO: use the device fixture?
        dev = Device()
        dev.set_current()
        # This is the consumer stream
        s = dev.create_stream() if stream else None

        view = StridedMemoryView(
            in_arr,
            stream_ptr=s.handle if s else -1)
        self._check_view(view, in_arr, dev)

    def _check_view(self, view, in_arr, dev):
        assert isinstance(view, StridedMemoryView)
        assert view.ptr == gpu_array_ptr(in_arr)
        assert view.shape == in_arr.shape
        strides_in_counts = convert_strides_to_counts(
            in_arr.strides, in_arr.dtype.itemsize)
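        # exporters may report either explicit strides or None for a
        # C-contiguous layout, so both forms are accepted here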
        if in_arr.flags["C_CONTIGUOUS"]:
            assert view.strides in (None, strides_in_counts)
        else:
            assert view.strides == strides_in_counts
        assert view.dtype == in_arr.dtype
        assert view.device_id == dev.device_id
        assert view.device_accessible
        assert view.exporting_obj is in_arr
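
For reference, a minimal end-to-end sketch of the flow these tests exercise (illustrative only, not a file in this diff; assumes CuPy is installed and a GPU is present):

import cupy as cp
import numpy as np

from cuda.core.experimental import Device
from cuda.core.experimental.utils import StridedMemoryView, viewable


@viewable((0,))
def describe(arr, stream_ptr):
    # The body only touches StridedMemoryView attributes, so it works
    # unchanged for any DLPack- or CAI-capable array library.
    view = arr.view(stream_ptr)
    assert isinstance(view, StridedMemoryView)
    return view.shape, view.dtype, view.device_id, view.device_accessible


dev = Device()
dev.set_current()
s = dev.create_stream()

print(describe(np.arange(6, dtype=np.float32), -1))        # CPU array, no stream ordering
print(describe(cp.arange(6, dtype=cp.float32), s.handle))  # GPU array, ordered on s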