Commit 6ca8eda

Merge branch 'main' into documentation-remove-gpu-dependency

ksimpson-work authored Jan 16, 2025
2 parents 8202536 + f0c304f
Showing 12 changed files with 119 additions and 124 deletions.
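Every hunk below follows the same pattern: direct imports of the low-level bindings (`from cuda import cuda, cudart`) are replaced by `driver` and `runtime` names re-exported from `cuda.core.experimental._utils`, so binding resolution happens in one place. The `_utils` module itself is not part of this diff; the following is a minimal sketch of what such a shim could look like, and the fallback logic is an assumption rather than the file's actual contents:

    # Hypothetical shim; cuda.core.experimental._utils is not shown in this diff.
    try:
        # newer cuda-python releases expose the bindings under cuda.bindings
        from cuda.bindings import driver, runtime
    except ImportError:
        # older releases expose them as cuda.cuda / cuda.cudart
        from cuda import cuda as driver
        from cuda import cudart as runtime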
4 changes: 2 additions & 2 deletions cuda_core/cuda/core/experimental/_context.py
@@ -4,7 +4,7 @@
 
 from dataclasses import dataclass
 
-from cuda import cuda
+from cuda.core.experimental._utils import driver
 
 
 @dataclass
@@ -20,7 +20,7 @@ def __init__(self):
 
     @staticmethod
     def _from_ctx(obj, dev_id):
-        assert isinstance(obj, cuda.CUcontext)
+        assert isinstance(obj, driver.CUcontext)
         ctx = Context.__new__(Context)
         ctx._handle = obj
         ctx._id = dev_id
47 changes: 23 additions & 24 deletions cuda_core/cuda/core/experimental/_device.py
@@ -5,11 +5,10 @@
 import threading
 from typing import Union
 
-from cuda import cuda, cudart
 from cuda.core.experimental._context import Context, ContextOptions
 from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
-from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition
+from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
 _tls_lock = threading.Lock()
@@ -47,17 +46,17 @@ class Device:
     def __new__(cls, device_id=None):
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
-            device_id = handle_return(cudart.cudaGetDevice())
+            device_id = handle_return(runtime.cudaGetDevice())
             assert isinstance(device_id, int), f"{device_id=}"
         else:
-            total = handle_return(cudart.cudaGetDeviceCount())
+            total = handle_return(runtime.cudaGetDeviceCount())
             if not isinstance(device_id, int) or not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
         with _tls_lock:
             if not hasattr(_tls, "devices"):
-                total = handle_return(cudart.cudaGetDeviceCount())
+                total = handle_return(runtime.cudaGetDeviceCount())
                 _tls.devices = []
                 for dev_id in range(total):
                     dev = super().__new__(cls)
@@ -66,7 +65,7 @@ def __new__(cls, device_id=None):
                     # use the SynchronousMemoryResource which does not use memory pools.
                     if (
                         handle_return(
-                            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
                         )
                     ) == 1:
                         dev._mr = _DefaultAsyncMempool(dev_id)
@@ -90,7 +89,7 @@ def device_id(self) -> int:
     @property
     def pci_bus_id(self) -> str:
         """Return a PCI Bus Id string for this device."""
-        bus_id = handle_return(cudart.cudaDeviceGetPCIBusId(13, self._id))
+        bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._id))
         return bus_id[:12].decode()
 
     @property
@@ -107,11 +106,11 @@ def uuid(self) -> str:
         driver is older than CUDA 11.4.
 
         """
-        driver_ver = handle_return(cuda.cuDriverGetVersion())
+        driver_ver = handle_return(driver.cuDriverGetVersion())
         if driver_ver >= 11040:
-            uuid = handle_return(cuda.cuDeviceGetUuid_v2(self._id))
+            uuid = handle_return(driver.cuDeviceGetUuid_v2(self._id))
         else:
-            uuid = handle_return(cuda.cuDeviceGetUuid(self._id))
+            uuid = handle_return(driver.cuDeviceGetUuid(self._id))
         uuid = uuid.bytes.hex()
         # 8-4-4-4-12
         return f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}"
@@ -120,24 +119,24 @@ def uuid(self) -> str:
     def name(self) -> str:
         """Return the device name."""
         # Use 256 characters to be consistent with CUDA Runtime
-        name = handle_return(cuda.cuDeviceGetName(256, self._id))
+        name = handle_return(driver.cuDeviceGetName(256, self._id))
         name = name.split(b"\0")[0]
         return name.decode()
 
     @property
     def properties(self) -> dict:
         """Return information about the compute-device."""
         # TODO: pythonize the key names
-        return handle_return(cudart.cudaGetDeviceProperties(self._id))
+        return handle_return(runtime.cudaGetDeviceProperties(self._id))
 
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
         major = handle_return(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
+            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
         )
         minor = handle_return(
-            cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
+            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
         )
         return ComputeCapability(major, minor)
 
@@ -151,7 +150,7 @@ def context(self) -> Context:
         Device must be initialized.
 
         """
-        ctx = handle_return(cuda.cuCtxGetCurrent())
+        ctx = handle_return(driver.cuCtxGetCurrent())
         assert int(ctx) != 0
         return Context._from_ctx(ctx, self._id)
 
@@ -224,23 +223,23 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:
                     "the provided context was created on a different "
                    f"device {ctx._id} other than the target {self._id}"
                 )
-            prev_ctx = handle_return(cuda.cuCtxPopCurrent())
-            handle_return(cuda.cuCtxPushCurrent(ctx._handle))
+            prev_ctx = handle_return(driver.cuCtxPopCurrent())
+            handle_return(driver.cuCtxPushCurrent(ctx._handle))
             self._has_inited = True
             if int(prev_ctx) != 0:
                 return Context._from_ctx(prev_ctx, self._id)
         else:
-            ctx = handle_return(cuda.cuCtxGetCurrent())
+            ctx = handle_return(driver.cuCtxGetCurrent())
             if int(ctx) == 0:
                 # use primary ctx
-                ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
-                handle_return(cuda.cuCtxPushCurrent(ctx))
+                ctx = handle_return(driver.cuDevicePrimaryCtxRetain(self._id))
+                handle_return(driver.cuCtxPushCurrent(ctx))
             else:
-                ctx_id = handle_return(cuda.cuCtxGetDevice())
+                ctx_id = handle_return(driver.cuCtxGetDevice())
                 if ctx_id != self._id:
                     # use primary ctx
-                    ctx = handle_return(cuda.cuDevicePrimaryCtxRetain(self._id))
-                    handle_return(cuda.cuCtxPushCurrent(ctx))
+                    ctx = handle_return(driver.cuDevicePrimaryCtxRetain(self._id))
+                    handle_return(driver.cuCtxPushCurrent(ctx))
                 else:
                     # no-op, a valid context already exists and is set current
                     pass
@@ -337,4 +336,4 @@ def sync(self):
         Device must be initialized.
 
         """
-        handle_return(cudart.cudaDeviceSynchronize())
+        handle_return(runtime.cudaDeviceSynchronize())
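The `Device` methods touched above are thin wrappers over driver/runtime calls. A short usage sketch, assuming only the public API visible in these hunks (printed values depend on the installed GPU):

    from cuda.core.experimental import Device

    dev = Device()     # instantiating does not initialize the GPU
    dev.set_current()  # pushes a context (primary context by default)
    print(dev.device_id, dev.name, dev.compute_capability)
    print(dev.uuid, dev.pci_bus_id)
    dev.sync()         # wraps cudaDeviceSynchronize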
19 changes: 9 additions & 10 deletions cuda_core/cuda/core/experimental/_event.py
@@ -6,8 +6,7 @@
 from dataclasses import dataclass
 from typing import Optional
 
-from cuda import cuda
-from cuda.core.experimental._utils import CUDAError, check_or_create_options, handle_return
+from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, handle_return
 
 
 @dataclass
@@ -60,7 +59,7 @@ def __init__(self, event_obj, handle):
 
     def close(self):
         if self.handle is not None:
-            handle_return(cuda.cuEventDestroy(self.handle))
+            handle_return(driver.cuEventDestroy(self.handle))
            self.handle = None
 
     __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited")
@@ -80,14 +79,14 @@ def _init(options: Optional[EventOptions] = None):
        self._timing_disabled = False
        self._busy_waited = False
        if not options.enable_timing:
-            flags |= cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING
+            flags |= driver.CUevent_flags.CU_EVENT_DISABLE_TIMING
            self._timing_disabled = True
        if options.busy_waited_sync:
-            flags |= cuda.CUevent_flags.CU_EVENT_BLOCKING_SYNC
+            flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
            self._busy_waited = True
        if options.support_ipc:
            raise NotImplementedError("TODO")
-        self._mnff.handle = handle_return(cuda.cuEventCreate(flags))
+        self._mnff.handle = handle_return(driver.cuEventCreate(flags))
        return self
 
    def close(self):
@@ -119,15 +118,15 @@ def sync(self):
        has been completed.
 
        """
-        handle_return(cuda.cuEventSynchronize(self._mnff.handle))
+        handle_return(driver.cuEventSynchronize(self._mnff.handle))
 
    @property
    def is_done(self) -> bool:
        """Return True if all captured works have been completed, otherwise False."""
-        (result,) = cuda.cuEventQuery(self._mnff.handle)
-        if result == cuda.CUresult.CUDA_SUCCESS:
+        (result,) = driver.cuEventQuery(self._mnff.handle)
+        if result == driver.CUresult.CUDA_SUCCESS:
            return True
-        elif result == cuda.CUresult.CUDA_ERROR_NOT_READY:
+        elif result == driver.CUresult.CUDA_ERROR_NOT_READY:
            return False
        else:
            raise CUDAError(f"unexpected error: {result}")
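The flag handling in `_init` maps `EventOptions` straight onto the driver's `CUevent_flags`. A hedged sketch of how these options are typically exercised; recording an event on a stream is assumed from the package's broader public API, which this diff does not show:

    from cuda.core.experimental import Device, EventOptions

    dev = Device()
    dev.set_current()
    s = dev.create_stream()
    # support_ipc=True would raise NotImplementedError per the hunk above
    e = s.record(options=EventOptions(enable_timing=False, busy_waited_sync=True))
    e.sync()          # wraps cuEventSynchronize
    print(e.is_done)  # wraps cuEventQuery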
15 changes: 7 additions & 8 deletions cuda_core/cuda/core/experimental/_launcher.py
@@ -5,12 +5,11 @@
 from dataclasses import dataclass
 from typing import Optional, Union
 
-from cuda import cuda
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
 from cuda.core.experimental._module import Kernel
 from cuda.core.experimental._stream import Stream
-from cuda.core.experimental._utils import CUDAError, check_or_create_options, get_binding_version, handle_return
+from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, get_binding_version, handle_return
 
 # TODO: revisit this treatment for py313t builds
 _inited = False
@@ -25,7 +24,7 @@ def _lazy_init():
     global _use_ex
     # binding availability depends on cuda-python version
     _py_major_minor = get_binding_version()
-    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _driver_ver = handle_return(driver.cuDriverGetVersion())
     _use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
     _inited = True
 
@@ -139,25 +138,25 @@ def launch(kernel, config, *kernel_args):
    # mainly to see if the "Ex" API is available and if so we use it, as it's more feature
    # rich.
    if _use_ex:
-        drv_cfg = cuda.CUlaunchConfig()
+        drv_cfg = driver.CUlaunchConfig()
        drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
        drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
        drv_cfg.hStream = config.stream.handle
        drv_cfg.sharedMemBytes = config.shmem_size
        attrs = []  # TODO: support more attributes
        if config.cluster:
-            attr = cuda.CUlaunchAttribute()
-            attr.id = cuda.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+            attr = driver.CUlaunchAttribute()
+            attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            dim = attr.value.clusterDim
            dim.x, dim.y, dim.z = config.cluster
            attrs.append(attr)
        drv_cfg.numAttrs = len(attrs)
        drv_cfg.attrs = attrs
-        handle_return(cuda.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
+        handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
    else:
        # TODO: check if config has any unsupported attrs
        handle_return(
-            cuda.cuLaunchKernel(
+            driver.cuLaunchKernel(
                int(kernel._handle), *config.grid, *config.block, config.shmem_size, config.stream._handle, args_ptr, 0
            )
        )
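`launch` picks between `cuLaunchKernelEx` and the older `cuLaunchKernel` based on the driver and binding versions probed in `_lazy_init`. A hedged call-site sketch using only the `LaunchConfig` fields this hunk reads (`grid`, `block`, `stream`, `shmem_size`, optional `cluster`); the `Program` compile step is assumed from the package's public API and is not part of this diff:

    from cuda.core.experimental import Device, LaunchConfig, Program, launch

    code = 'extern "C" __global__ void noop() {}'
    dev = Device()
    dev.set_current()
    s = dev.create_stream()
    kernel = Program(code, code_type="c++").compile("cubin").get_kernel("noop")
    cfg = LaunchConfig(grid=1, block=1, stream=s, shmem_size=0)
    launch(kernel, cfg)  # routes to cuLaunchKernelEx when _use_ex is True
    s.sync()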
7 changes: 3 additions & 4 deletions cuda_core/cuda/core/experimental/_linker.py
@@ -9,9 +9,8 @@
 from dataclasses import dataclass
 from typing import List, Optional
 
-from cuda import cuda
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options, handle_return
+from cuda.core.experimental._utils import check_or_create_options, driver, handle_return
 
 # TODO: revisit this treatment for py313t builds
 _driver = None  # populated if nvJitLink cannot be used
@@ -29,7 +28,7 @@ def _decide_nvjitlink_or_driver():
     if _driver or _nvjitlink:
         return
 
-    _driver_ver = handle_return(cuda.cuDriverGetVersion())
+    _driver_ver = handle_return(driver.cuDriverGetVersion())
     _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
     try:
         from cuda.bindings import nvjitlink as _nvjitlink
@@ -49,7 +48,7 @@ def _decide_nvjitlink_or_driver():
            stacklevel=3,
            category=RuntimeWarning,
        )
-        _driver = cuda
+        _driver = driver
        return True
    else:
        return False
21 changes: 10 additions & 11 deletions cuda_core/cuda/core/experimental/_memory.py
@@ -8,10 +8,9 @@
 import weakref
 from typing import Optional, Tuple, TypeVar
 
-from cuda import cuda
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import default_stream
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils import driver, handle_return
 
 PyCapsule = TypeVar("PyCapsule")
 
@@ -141,7 +140,7 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
            dst = self._mnff.mr.allocate(self._mnff.size, stream)
        if dst._mnff.size != self._mnff.size:
            raise ValueError("buffer sizes mismatch between src and dst")
-        handle_return(cuda.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
+        handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
        return dst
 
    def copy_from(self, src: Buffer, *, stream):
@@ -160,7 +159,7 @@ def copy_from(self, src: Buffer, *, stream):
            raise ValueError("stream must be provided")
        if src._mnff.size != self._mnff.size:
            raise ValueError("buffer sizes mismatch between src and dst")
-        handle_return(cuda.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
+        handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
 
    def __dlpack__(
        self,
@@ -243,19 +242,19 @@ class _DefaultAsyncMempool(MemoryResource):
    __slots__ = ("_dev_id",)
 
    def __init__(self, dev_id):
-        self._handle = handle_return(cuda.cuDeviceGetMemPool(dev_id))
+        self._handle = handle_return(driver.cuDeviceGetMemPool(dev_id))
        self._dev_id = dev_id
 
    def allocate(self, size, stream=None) -> Buffer:
        if stream is None:
            stream = default_stream()
-        ptr = handle_return(cuda.cuMemAllocFromPoolAsync(size, self._handle, stream.handle))
+        ptr = handle_return(driver.cuMemAllocFromPoolAsync(size, self._handle, stream.handle))
        return Buffer(ptr, size, self)
 
    def deallocate(self, ptr, size, stream=None):
        if stream is None:
            stream = default_stream()
-        handle_return(cuda.cuMemFreeAsync(ptr, stream.handle))
+        handle_return(driver.cuMemFreeAsync(ptr, stream.handle))
 
    @property
    def is_device_accessible(self) -> bool:
@@ -276,11 +275,11 @@ def __init__(self):
        self._handle = None
 
    def allocate(self, size, stream=None) -> Buffer:
-        ptr = handle_return(cuda.cuMemAllocHost(size))
+        ptr = handle_return(driver.cuMemAllocHost(size))
        return Buffer(ptr, size, self)
 
    def deallocate(self, ptr, size, stream=None):
-        handle_return(cuda.cuMemFreeHost(ptr))
+        handle_return(driver.cuMemFreeHost(ptr))
 
    @property
    def is_device_accessible(self) -> bool:
@@ -303,14 +302,14 @@ def __init__(self, dev_id):
        self._dev_id = dev_id
 
    def allocate(self, size, stream=None) -> Buffer:
-        ptr = handle_return(cuda.cuMemAlloc(size))
+        ptr = handle_return(driver.cuMemAlloc(size))
        return Buffer(ptr, size, self)
 
    def deallocate(self, ptr, size, stream=None):
        if stream is None:
            stream = default_stream()
        stream.sync()
-        handle_return(cuda.cuMemFree(ptr))
+        handle_return(driver.cuMemFree(ptr))
 
    @property
    def is_device_accessible(self) -> bool:
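All allocation paths above return a `Buffer`, and `copy_to`/`copy_from` are stream-ordered `cuMemcpyAsync` wrappers that insist on matching sizes. A minimal round-trip sketch, assuming `Device.allocate` routes through the per-device memory resource wired up in `_device.py`:

    from cuda.core.experimental import Device

    dev = Device()
    dev.set_current()
    s = dev.create_stream()
    src = dev.allocate(1024, stream=s)  # Buffer from the device's memory resource
    dst = src.copy_to(stream=s)         # allocates an equal-size dst, then cuMemcpyAsync
    src.copy_from(dst, stream=s)        # sizes must match or ValueError is raised
    s.sync()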
(6 of the 12 changed files are shown above; the remaining file diffs were not rendered.)