Skip to content

Commit

Permalink
Merge pull request #247 from leofang/strided_memory_view
Browse files Browse the repository at this point in the history
Add docs, tests, and samples for `StridedMemoryView`/`@args_viewable_as_strided_memory`
  • Loading branch information
leofang authored Dec 4, 2024
2 parents c3077da + 16fc9f6 commit a725723
Show file tree
Hide file tree
Showing 12 changed files with 508 additions and 53 deletions.
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
cuda/_version.py export-subst

* text eol=lf

# we do not own any headers checked in, don't touch them
*.h binary
*.hpp binary
1 change: 1 addition & 0 deletions cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from cuda.core.experimental import utils
from cuda.core.experimental._device import Device
from cuda.core.experimental._event import EventOptions
from cuda.core.experimental._launcher import LaunchConfig, launch
Expand Down
111 changes: 94 additions & 17 deletions cuda_core/cuda/core/experimental/_memoryview.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,67 @@ from cuda.core.experimental._utils import handle_return

@cython.dataclasses.dataclass
cdef class StridedMemoryView:

"""A dataclass holding metadata of a strided dense array/tensor.
A :obj:`StridedMemoryView` instance can be created in two ways:
1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
2. Explicit construction, see below
This object supports both DLPack (up to v1.0) and CUDA Array Interface
(CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
supported.
Since either way would take a consumer stream, for DLPack it is passed to
``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a
stream order will be established between the consumer stream and the
producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if
``cudaStreamWaitEvent`` is called by this method.
To opt-out of the stream ordering operation in either DLPack or CAI,
please pass ``stream_ptr=-1``. Note that this deviates (on purpose)
from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core``
does not encourage using the (legacy) default/null stream, but is
consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
internally passed to ``obj.__dlpack__()`` instead.
Attributes
----------
ptr : int
Pointer to the tensor buffer (as a Python `int`).
shape: tuple
Shape of the tensor.
strides: tuple
Strides of the tensor (in **counts**, not bytes).
dtype: numpy.dtype
Data type of the tensor.
device_id: int
The device ID for where the tensor is located. It is -1 for CPU tensors
(meaning those only accessible from the host).
is_device_accessible: bool
Whether the tensor data can be accessed on the GPU.
readonly: bool
Whether the tensor data can be modified in place.
exporting_obj: Any
A reference to the original tensor object that is being viewed.
Parameters
----------
obj : Any
Any objects that supports either DLPack (up to v1.0) or CUDA Array
Interface (v3).
stream_ptr: int
The pointer address (as Python `int`) to the **consumer** stream.
Stream ordering will be properly established unless ``-1`` is passed.
"""
# TODO: switch to use Cython's cdef typing?
ptr: int = None
shape: tuple = None
strides: tuple = None # in counts, not bytes
dtype: numpy.dtype = None
device_id: int = None # -1 for CPU
device_accessible: bool = None
is_device_accessible: bool = None
readonly: bool = None
exporting_obj: Any = None

Expand All @@ -48,7 +101,7 @@ cdef class StridedMemoryView:
+ f" strides={self.strides},\n"
+ f" dtype={get_simple_repr(self.dtype)},\n"
+ f" device_id={self.device_id},\n"
+ f" device_accessible={self.device_accessible},\n"
+ f" is_device_accessible={self.is_device_accessible},\n"
+ f" readonly={self.readonly},\n"
+ f" exporting_obj={get_simple_repr(self.exporting_obj)})")

Expand Down Expand Up @@ -99,28 +152,25 @@ cdef class _StridedMemoryViewProxy:

cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
cdef int dldevice, device_id, i
cdef bint device_accessible, versioned, is_readonly
cdef bint is_device_accessible, versioned, is_readonly
is_device_accessible = False
dldevice, device_id = obj.__dlpack_device__()
if dldevice == _kDLCPU:
device_accessible = False
assert device_id == 0
device_id = -1
if stream_ptr is None:
raise BufferError("stream=None is ambiguous with view()")
elif stream_ptr == -1:
stream_ptr = None
elif dldevice == _kDLCUDA:
device_accessible = True
assert device_id >= 0
is_device_accessible = True
# no need to check other stream values, it's a pass-through
if stream_ptr is None:
raise BufferError("stream=None is ambiguous with view()")
elif dldevice == _kDLCUDAHost:
device_accessible = True
assert device_id == 0
# just do a pass-through without any checks, as pinned memory can be
# accessed on both host and device
elif dldevice == _kDLCUDAManaged:
device_accessible = True
# just do a pass-through without any checks, as managed memory can be
elif dldevice in (_kDLCUDAHost, _kDLCUDAManaged):
is_device_accessible = True
# just do a pass-through without any checks, as pinned/managed memory can be
# accessed on both host and device
else:
raise BufferError("device not supported")
Expand Down Expand Up @@ -171,7 +221,7 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None):
buf.strides = None
buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype)
buf.device_id = device_id
buf.device_accessible = device_accessible
buf.is_device_accessible = is_device_accessible
buf.readonly = is_readonly
buf.exporting_obj = obj

Expand Down Expand Up @@ -261,7 +311,7 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
if buf.strides is not None:
# convert to counts
buf.strides = tuple(s // buf.dtype.itemsize for s in buf.strides)
buf.device_accessible = True
buf.is_device_accessible = True
buf.device_id = handle_return(
cuda.cuPointerGetAttribute(
cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
Expand All @@ -284,7 +334,34 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
return buf


def viewable(tuple arg_indices):
def args_viewable_as_strided_memory(tuple arg_indices):
"""Decorator to create proxy objects to :obj:`StridedMemoryView` for the
specified positional arguments.
This allows array/tensor attributes to be accessed inside the function
implementation, while keeping the function body array-library-agnostic (if
desired).
Inside the decorated function, the specified arguments become instances
of an (undocumented) proxy type, regardless of its original source. A
:obj:`StridedMemoryView` instance can be obtained by passing the (consumer)
stream pointer (as a Python `int`) to the proxies's ``view()`` method. For
example:
.. code-block:: python
@args_viewable_as_strided_memory((1,))
def my_func(arg0, arg1, arg2, stream: Stream):
# arg1 can be any object supporting DLPack or CUDA Array Interface
view = arg1.view(stream.handle)
assert isinstance(view, StridedMemoryView)
...
Parameters
----------
arg_indices : tuple
The indices of the target positional arguments.
"""
def wrapped_func_with_indices(func):
@functools.wraps(func)
def wrapped_func(*args, **kwargs):
Expand Down
4 changes: 4 additions & 0 deletions cuda_core/cuda/core/experimental/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

from cuda.core.experimental._memoryview import (
StridedMemoryView, # noqa: F401
args_viewable_as_strided_memory, # noqa: F401
)
15 changes: 15 additions & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,18 @@ CUDA compilation toolchain
:toctree: generated/

Program


.. module:: cuda.core.experimental.utils

Utility functions
-----------------

.. autosummary::
:toctree: generated/

args_viewable_as_strided_memory

:template: dataclass.rst

StridedMemoryView
9 changes: 9 additions & 0 deletions cuda_core/docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.napoleon",
"sphinx.ext.intersphinx",
"myst_nb",
"enum_tools.autoenum",
"sphinx_copybutton",
Expand Down Expand Up @@ -82,3 +83,11 @@

# skip cmdline prompts
copybutton_exclude = ".linenos, .gp"

intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
}

napoleon_google_docstring = False
napoleon_numpy_docstring = True
1 change: 1 addition & 0 deletions cuda_core/docs/source/release.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
maxdepth: 3
---
0.1.1 <release/0.1.1-notes>
0.1.0 <release/0.1.0-notes>
```
13 changes: 13 additions & 0 deletions cuda_core/docs/source/release/0.1.1-notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# `cuda.core` Release notes

Released on Dec XX, 2024

## Hightlights
- Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete
implementation of DLPack & CUDA Array Interface supports.


## Limitations

- All APIs are currently *experimental* and subject to change without deprecation notice.
Please kindly share your feedbacks with us so that we can make `cuda.core` better!
1 change: 1 addition & 0 deletions cuda_core/docs/versions.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"latest" : "latest",
"0.1.1" : "0.1.1",
"0.1.0" : "0.1.0"
}
Loading

0 comments on commit a725723

Please sign in to comment.