diff --git a/.gitattributes b/.gitattributes index 00407cdc..aeb32006 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,3 +5,5 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# git should not convert line endings in PNG files +*.png binary diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 431bb7c5..c2d246aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -repos: - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.4 - hooks: - - id: ruff - args: [--fix, --show-fixes] - - id: ruff-format - -default_language_version: - python: python3 +# Copyright (c) 2024, NVIDIA CORPORATION. + +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.4 + hooks: + - id: ruff + args: [--fix, --show-fixes] + - id: ruff-format + +default_language_version: + python: python3 diff --git a/cuda_bindings/benchmarks/test_launch_latency.py b/cuda_bindings/benchmarks/test_launch_latency.py index 8d70bfe2..f16e971a 100755 --- a/cuda_bindings/benchmarks/test_launch_latency.py +++ b/cuda_bindings/benchmarks/test_launch_latency.py @@ -1,341 +1,341 @@ -# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. -import ctypes - -import pytest - -from cuda import cuda - -from .kernels import kernel_string -from .perf_test_utils import ASSERT_DRV - - -def launch(kernel, stream, args=(), arg_types=()): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - (args, arg_types), - 0, - ) # arguments - - -def launch_packed(kernel, stream, params): - cuda.cuLaunchKernel( - kernel, - 1, - 1, - 1, # grid dim - 1, - 1, - 1, # block dim - 0, - stream, # shared mem and stream - params, - 0, - ) # arguments - - -# Measure launch latency with no parmaeters -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") - ASSERT_DRV(err) - - benchmark(launch, func, stream) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with a single parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel") - ASSERT_DRV(err) - - err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) - ASSERT_DRV(err) - - benchmark(launch, func, stream, args=(f,), arg_types=(None,)) - - cuda.cuCtxSynchronize() - - (err,) = cuda.cuMemFree(f) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 512 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") - ASSERT_DRV(err) - - args = [True] * 512 - arg_types = [ctypes.c_bool] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") - ASSERT_DRV(err) - - args = [1.2345] * 512 - arg_types = [ctypes.c_double] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") - ASSERT_DRV(err) - - args = [123] * 512 - arg_types = [ctypes.c_int] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") - ASSERT_DRV(err) - - args = [127] * 512 - arg_types = [ctypes.c_byte] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") - ASSERT_DRV(err) - - args = [9223372036854775806] * 512 - arg_types = [ctypes.c_longlong] * 512 - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 256 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters using builtin parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") - ASSERT_DRV(err) - - args = [] - arg_types = [None] * 16 - for _ in arg_types: - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - args.append(p) - - args = tuple(args) - arg_types = tuple(arg_types) - - benchmark(launch, func, stream, args=args, arg_types=arg_types) - - cuda.cuCtxSynchronize() - - for p in args: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with many parameters, excluding parameter packing -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - val_ps = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - val_ps.append(ctypes.c_void_p(int(vals[i]))) - - packagedParams = (ctypes.c_void_p * 512)() - for i in range(512): - packagedParams[i] = ctypes.addressof(val_ps[i]) - - benchmark(launch_packed, func, stream, packagedParams) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -def pack_and_launch(kernel, stream, params): - packed_params = (ctypes.c_void_p * len(params))() - ptrs = [0] * len(params) - for i in range(len(params)): - ptrs[i] = ctypes.c_void_p(int(params[i])) - packed_params[i] = ctypes.addressof(ptrs[i]) - - cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) - - -# Measure launch latency plus parameter packing using ctypes -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") - ASSERT_DRV(err) - - vals = [] - for i in range(512): - err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) - ASSERT_DRV(err) - vals.append(p) - - benchmark(pack_and_launch, func, stream, vals) - - cuda.cuCtxSynchronize() - - for p in vals: - (err,) = cuda.cuMemFree(p) - ASSERT_DRV(err) - - -# Measure launch latency with a single large struct parameter -@pytest.mark.benchmark(group="launch-latency") -def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): - device, ctx, stream = init_cuda - module = load_module(kernel_string, device) - - err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") - ASSERT_DRV(err) - - class struct_2048B(ctypes.Structure): - _fields_ = [("values", ctypes.c_uint8 * 2048)] - - benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) - - cuda.cuCtxSynchronize() +# Copyright 2021-2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. +import ctypes + +import pytest + +from cuda import cuda + +from .kernels import kernel_string +from .perf_test_utils import ASSERT_DRV + + +def launch(kernel, stream, args=(), arg_types=()): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + (args, arg_types), + 0, + ) # arguments + + +def launch_packed(kernel, stream, params): + cuda.cuLaunchKernel( + kernel, + 1, + 1, + 1, # grid dim + 1, + 1, + 1, # block dim + 0, + stream, # shared mem and stream + params, + 0, + ) # arguments + + +# Measure launch latency with no parmaeters +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_empty_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"empty_kernel") + ASSERT_DRV(err) + + benchmark(launch, func, stream) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with a single parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel") + ASSERT_DRV(err) + + err, f = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_float)) + ASSERT_DRV(err) + + benchmark(launch, func, stream, args=(f,), arg_types=(None,)) + + cuda.cuCtxSynchronize() + + (err,) = cuda.cuMemFree(f) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 512 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bools(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_bools") + ASSERT_DRV(err) + + args = [True] * 512 + arg_types = [ctypes.c_bool] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_doubles(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_doubles") + ASSERT_DRV(err) + + args = [1.2345] * 512 + arg_types = [ctypes.c_double] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_ints(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_ints") + ASSERT_DRV(err) + + args = [123] * 512 + arg_types = [ctypes.c_int] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_bytes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_chars") + ASSERT_DRV(err) + + args = [127] * 512 + arg_types = [ctypes.c_byte] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_longlongs(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_longlongs") + ASSERT_DRV(err) + + args = [9223372036854775806] * 512 + arg_types = [ctypes.c_longlong] * 512 + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_256_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_256_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 256 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters using builtin parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_16_args(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_16_args") + ASSERT_DRV(err) + + args = [] + arg_types = [None] * 16 + for _ in arg_types: + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + args.append(p) + + args = tuple(args) + arg_types = tuple(arg_types) + + benchmark(launch, func, stream, args=args, arg_types=arg_types) + + cuda.cuCtxSynchronize() + + for p in args: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with many parameters, excluding parameter packing +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + val_ps = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + val_ps.append(ctypes.c_void_p(int(vals[i]))) + + packagedParams = (ctypes.c_void_p * 512)() + for i in range(512): + packagedParams[i] = ctypes.addressof(val_ps[i]) + + benchmark(launch_packed, func, stream, packagedParams) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +def pack_and_launch(kernel, stream, params): + packed_params = (ctypes.c_void_p * len(params))() + ptrs = [0] * len(params) + for i in range(len(params)): + ptrs[i] = ctypes.c_void_p(int(params[i])) + packed_params[i] = ctypes.addressof(ptrs[i]) + + cuda.cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_params, 0) + + +# Measure launch latency plus parameter packing using ctypes +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_512_args_ctypes_with_packing(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_512_args") + ASSERT_DRV(err) + + vals = [] + for i in range(512): + err, p = cuda.cuMemAlloc(ctypes.sizeof(ctypes.c_int)) + ASSERT_DRV(err) + vals.append(p) + + benchmark(pack_and_launch, func, stream, vals) + + cuda.cuCtxSynchronize() + + for p in vals: + (err,) = cuda.cuMemFree(p) + ASSERT_DRV(err) + + +# Measure launch latency with a single large struct parameter +@pytest.mark.benchmark(group="launch-latency") +def test_launch_latency_small_kernel_2048B(benchmark, init_cuda, load_module): + device, ctx, stream = init_cuda + module = load_module(kernel_string, device) + + err, func = cuda.cuModuleGetFunction(module, b"small_kernel_2048B") + ASSERT_DRV(err) + + class struct_2048B(ctypes.Structure): + _fields_ = [("values", ctypes.c_uint8 * 2048)] + + benchmark(launch, func, stream, args=(struct_2048B(),), arg_types=(None,)) + + cuda.cuCtxSynchronize() diff --git a/cuda_bindings/tests/test_nvjitlink.py b/cuda_bindings/tests/test_nvjitlink.py index d92a3ca7..839c7be1 100644 --- a/cuda_bindings/tests/test_nvjitlink.py +++ b/cuda_bindings/tests/test_nvjitlink.py @@ -1,168 +1,168 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE - -import pytest - -from cuda.bindings import nvjitlink, nvrtc - -# Establish a handful of compatible architectures and PTX versions to test with -ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] -PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] - - -def ptx_header(version, arch): - return f""" -.version {version} -.target {arch} -.address_size 64 -""" - - -ptx_kernel = """ -.visible .entry _Z6kernelPi( - .param .u64 _Z6kernelPi_param_0 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<3>; - .reg .b64 %rd<3>; - - ld.param.u64 %rd1, [_Z6kernelPi_param_0]; - cvta.to.global.u64 %rd2, %rd1; - mov.u32 %r1, %tid.x; - st.global.u32 [%rd2+0], %r1; - ret; -} -""" - -minimal_ptx_kernel = """ -.func _MinimalKernel() -{ - ret; -} -""" - -ptx_kernel_bytes = [ - (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) -] -minimal_ptx_kernel_bytes = [ - (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8") - for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) -] - - -# create a valid LTOIR input for testing -@pytest.fixture -def get_dummy_ltoir(): - def CHECK_NVRTC(err): - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"Nvrtc Error: {err}") - - empty_cplusplus_kernel = "__global__ void A() {}" - err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], []) - CHECK_NVRTC(err) - nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"]) - err, size = nvrtc.nvrtcGetLTOIRSize(program_handle) - CHECK_NVRTC(err) - empty_kernel_ltoir = b" " * size - (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir) - CHECK_NVRTC(err) - (err,) = nvrtc.nvrtcDestroyProgram(program_handle) - CHECK_NVRTC(err) - return empty_kernel_ltoir - - -def test_unrecognized_option_error(): - with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): - nvjitlink.create(1, ["-fictitious_option"]) - - -def test_invalid_arch_error(): - with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): - nvjitlink.create(1, ["-arch=sm_XX"]) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_create_and_destroy(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - assert handle != 0 - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_complete_empty(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_add_data(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_add_file(option, ptx_bytes, tmp_path): - handle = nvjitlink.create(1, [f"-arch={option}"]) - file_path = tmp_path / "test_file.cubin" - file_path.write_bytes(ptx_bytes) - nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path)) - nvjitlink.complete(handle) - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_get_error_log(option): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.complete(handle) - log_size = nvjitlink.get_error_log_size(handle) - log = bytearray(log_size) - nvjitlink.get_error_log(handle, log) - assert len(log) == log_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_get_info_log(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - log_size = nvjitlink.get_info_log_size(handle) - log = bytearray(log_size) - nvjitlink.get_info_log(handle, log) - assert len(log) == log_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) -def test_get_linked_cubin(option, ptx_bytes): - handle = nvjitlink.create(1, [f"-arch={option}"]) - nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") - nvjitlink.complete(handle) - cubin_size = nvjitlink.get_linked_cubin_size(handle) - cubin = bytearray(cubin_size) - nvjitlink.get_linked_cubin(handle, cubin) - assert len(cubin) == cubin_size - nvjitlink.destroy(handle) - - -@pytest.mark.parametrize("option", ARCHITECTURES) -def test_get_linked_ptx(option, get_dummy_ltoir): - handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"]) - nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data") - nvjitlink.complete(handle) - ptx_size = nvjitlink.get_linked_ptx_size(handle) - ptx = bytearray(ptx_size) - nvjitlink.get_linked_ptx(handle, ptx) - assert len(ptx) == ptx_size - nvjitlink.destroy(handle) - - -def test_package_version(): - ver = nvjitlink.version() - assert len(ver) == 2 - assert ver >= (12, 0) +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +import pytest + +from cuda.bindings import nvjitlink, nvrtc + +# Establish a handful of compatible architectures and PTX versions to test with +ARCHITECTURES = ["sm_60", "sm_75", "sm_80", "sm_90"] +PTX_VERSIONS = ["5.0", "6.4", "7.0", "8.5"] + + +def ptx_header(version, arch): + return f""" +.version {version} +.target {arch} +.address_size 64 +""" + + +ptx_kernel = """ +.visible .entry _Z6kernelPi( + .param .u64 _Z6kernelPi_param_0 +) +{ + .reg .pred %p<2>; + .reg .b32 %r<3>; + .reg .b64 %rd<3>; + + ld.param.u64 %rd1, [_Z6kernelPi_param_0]; + cvta.to.global.u64 %rd2, %rd1; + mov.u32 %r1, %tid.x; + st.global.u32 [%rd2+0], %r1; + ret; +} +""" + +minimal_ptx_kernel = """ +.func _MinimalKernel() +{ + ret; +} +""" + +ptx_kernel_bytes = [ + (ptx_header(version, arch) + ptx_kernel).encode("utf-8") for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) +] +minimal_ptx_kernel_bytes = [ + (ptx_header(version, arch) + minimal_ptx_kernel).encode("utf-8") + for version, arch in zip(PTX_VERSIONS, ARCHITECTURES) +] + + +# create a valid LTOIR input for testing +@pytest.fixture +def get_dummy_ltoir(): + def CHECK_NVRTC(err): + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"Nvrtc Error: {err}") + + empty_cplusplus_kernel = "__global__ void A() {}" + err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], []) + CHECK_NVRTC(err) + nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto"]) + err, size = nvrtc.nvrtcGetLTOIRSize(program_handle) + CHECK_NVRTC(err) + empty_kernel_ltoir = b" " * size + (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir) + CHECK_NVRTC(err) + (err,) = nvrtc.nvrtcDestroyProgram(program_handle) + CHECK_NVRTC(err) + return empty_kernel_ltoir + + +def test_unrecognized_option_error(): + with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): + nvjitlink.create(1, ["-fictitious_option"]) + + +def test_invalid_arch_error(): + with pytest.raises(nvjitlink.nvJitLinkError, match="ERROR_UNRECOGNIZED_OPTION"): + nvjitlink.create(1, ["-arch=sm_XX"]) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_create_and_destroy(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + assert handle != 0 + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_complete_empty(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_add_data(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_add_file(option, ptx_bytes, tmp_path): + handle = nvjitlink.create(1, [f"-arch={option}"]) + file_path = tmp_path / "test_file.cubin" + file_path.write_bytes(ptx_bytes) + nvjitlink.add_file(handle, nvjitlink.InputType.ANY, str(file_path)) + nvjitlink.complete(handle) + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_get_error_log(option): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.complete(handle) + log_size = nvjitlink.get_error_log_size(handle) + log = bytearray(log_size) + nvjitlink.get_error_log(handle, log) + assert len(log) == log_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_get_info_log(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + log_size = nvjitlink.get_info_log_size(handle) + log = bytearray(log_size) + nvjitlink.get_info_log(handle, log) + assert len(log) == log_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option, ptx_bytes", zip(ARCHITECTURES, ptx_kernel_bytes)) +def test_get_linked_cubin(option, ptx_bytes): + handle = nvjitlink.create(1, [f"-arch={option}"]) + nvjitlink.add_data(handle, nvjitlink.InputType.ANY, ptx_bytes, len(ptx_bytes), "test_data") + nvjitlink.complete(handle) + cubin_size = nvjitlink.get_linked_cubin_size(handle) + cubin = bytearray(cubin_size) + nvjitlink.get_linked_cubin(handle, cubin) + assert len(cubin) == cubin_size + nvjitlink.destroy(handle) + + +@pytest.mark.parametrize("option", ARCHITECTURES) +def test_get_linked_ptx(option, get_dummy_ltoir): + handle = nvjitlink.create(3, [f"-arch={option}", "-lto", "-ptx"]) + nvjitlink.add_data(handle, nvjitlink.InputType.LTOIR, get_dummy_ltoir, len(get_dummy_ltoir), "test_data") + nvjitlink.complete(handle) + ptx_size = nvjitlink.get_linked_ptx_size(handle) + ptx = bytearray(ptx_size) + nvjitlink.get_linked_ptx(handle, ptx) + assert len(ptx) == ptx_size + nvjitlink.destroy(handle) + + +def test_package_version(): + ver = nvjitlink.version() + assert len(ver) == 2 + assert ver >= (12, 0) diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 9b94ecd3..9a9432cb 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -1,25 +1,25 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -# If we have subcategories of examples in the future, this file can be split along those lines - -import glob -import os - -import pytest - -from .utils import run_example - -samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") -sample_files = glob.glob(samples_path + "**/*.py", recursive=True) - - -@pytest.mark.parametrize("example", sample_files) -class TestExamples: - def test_example(self, example, deinit_cuda): - run_example(samples_path, example) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +# If we have subcategories of examples in the future, this file can be split along those lines + +import glob +import os + +import pytest + +from .utils import run_example + +samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "examples") +sample_files = glob.glob(samples_path + "**/*.py", recursive=True) + + +@pytest.mark.parametrize("example", sample_files) +class TestExamples: + def test_example(self, example, deinit_cuda): + run_example(samples_path, example) diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py index f6ac3e15..3d218a91 100644 --- a/cuda_core/tests/example_tests/utils.py +++ b/cuda_core/tests/example_tests/utils.py @@ -1,56 +1,56 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import gc -import os -import sys - -import cupy as cp -import pytest - - -class SampleTestError(Exception): - pass - - -def parse_python_script(filepath): - if not filepath.endswith(".py"): - raise ValueError(f"{filepath} not supported") - with open(filepath, encoding="utf-8") as f: - script = f.read() - return script - - -def run_example(samples_path, filename, env=None): - fullpath = os.path.join(samples_path, filename) - script = parse_python_script(fullpath) - try: - old_argv = sys.argv - sys.argv = [fullpath] - old_sys_path = sys.path.copy() - sys.path.append(samples_path) - exec(script, env if env else {}) - except ImportError as e: - # for samples requiring any of optional dependencies - for m in ("cupy",): - if f"No module named '{m}'" in str(e): - pytest.skip(f"{m} not installed, skipping related tests") - break - else: - raise - except Exception as e: - msg = "\n" - msg += f"Got error ({filename}):\n" - msg += str(e) - raise SampleTestError(msg) from e - finally: - sys.path = old_sys_path - sys.argv = old_argv - # further reduce the memory watermark - gc.collect() - cp.get_default_memory_pool().free_all_blocks() +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import gc +import os +import sys + +import cupy as cp +import pytest + + +class SampleTestError(Exception): + pass + + +def parse_python_script(filepath): + if not filepath.endswith(".py"): + raise ValueError(f"{filepath} not supported") + with open(filepath, encoding="utf-8") as f: + script = f.read() + return script + + +def run_example(samples_path, filename, env=None): + fullpath = os.path.join(samples_path, filename) + script = parse_python_script(fullpath) + try: + old_argv = sys.argv + sys.argv = [fullpath] + old_sys_path = sys.path.copy() + sys.path.append(samples_path) + exec(script, env if env else {}) + except ImportError as e: + # for samples requiring any of optional dependencies + for m in ("cupy",): + if f"No module named '{m}'" in str(e): + pytest.skip(f"{m} not installed, skipping related tests") + break + else: + raise + except Exception as e: + msg = "\n" + msg += f"Got error ({filename}):\n" + msg += str(e) + raise SampleTestError(msg) from e + finally: + sys.path = old_sys_path + sys.argv = old_argv + # further reduce the memory watermark + gc.collect() + cp.get_default_memory_pool().free_all_blocks() diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index afc3ed5b..876299f3 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -1,80 +1,80 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core.experimental import Device -from cuda.core.experimental._utils import ComputeCapability, handle_return - - -def test_device_set_current(deinit_cuda): - device = Device() - device.set_current() - assert handle_return(driver.cuCtxGetCurrent()) is not None - - -def test_device_repr(): - device = Device(0) - assert str(device).startswith("= 11040: - uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id)) - else: - uuid = handle_return(driver.cuDeviceGetUuid(device.device_id)) - uuid = uuid.bytes.hex() - expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" - assert device.uuid == expected_uuid - - -def test_name(): - device = Device() - name = handle_return(driver.cuDeviceGetName(128, device.device_id)) - name = name.split(b"\0")[0] - assert device.name == name.decode() - - -def test_compute_capability(): - device = Device() - major = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id) - ) - minor = handle_return( - runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id) - ) - expected_cc = ComputeCapability(major, minor) - assert device.compute_capability == expected_cc +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +try: + from cuda.bindings import driver, runtime +except ImportError: + from cuda import cuda as driver + from cuda import cudart as runtime + +from cuda.core.experimental import Device +from cuda.core.experimental._utils import ComputeCapability, handle_return + + +def test_device_set_current(deinit_cuda): + device = Device() + device.set_current() + assert handle_return(driver.cuCtxGetCurrent()) is not None + + +def test_device_repr(): + device = Device(0) + assert str(device).startswith("= 11040: + uuid = handle_return(driver.cuDeviceGetUuid_v2(device.device_id)) + else: + uuid = handle_return(driver.cuDeviceGetUuid(device.device_id)) + uuid = uuid.bytes.hex() + expected_uuid = f"{uuid[:8]}-{uuid[8:12]}-{uuid[12:16]}-{uuid[16:20]}-{uuid[20:]}" + assert device.uuid == expected_uuid + + +def test_name(): + device = Device() + name = handle_return(driver.cuDeviceGetName(128, device.device_id)) + name = name.split(b"\0")[0] + assert device.name == name.decode() + + +def test_compute_capability(): + device = Device() + major = handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id) + ) + minor = handle_return( + runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id) + ) + expected_cc = ComputeCapability(major, minor) + assert device.compute_capability == expected_cc diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 21548078..0d650b4f 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -1,46 +1,46 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, EventOptions - - -@pytest.mark.parametrize("enable_timing", [True, False, None]) -def test_timing(init_cuda, enable_timing): - options = EventOptions(enable_timing=enable_timing) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) - - -def test_is_sync_busy_waited(init_cuda): - options = EventOptions(enable_timing=False, busy_waited_sync=True) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_sync_busy_waited is True - - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_sync_busy_waited is False - - -def test_sync(init_cuda): - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - event.sync() - assert event.is_done is True - - -def test_is_done(init_cuda): - options = EventOptions(enable_timing=False) - stream = Device().create_stream() - event = stream.record(options=options) - assert event.is_done is True +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest + +from cuda.core.experimental import Device, EventOptions + + +@pytest.mark.parametrize("enable_timing", [True, False, None]) +def test_timing(init_cuda, enable_timing): + options = EventOptions(enable_timing=enable_timing) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_timing_disabled == (not enable_timing if enable_timing is not None else True) + + +def test_is_sync_busy_waited(init_cuda): + options = EventOptions(enable_timing=False, busy_waited_sync=True) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_sync_busy_waited is True + + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_sync_busy_waited is False + + +def test_sync(init_cuda): + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + event.sync() + assert event.is_done is True + + +def test_is_done(init_cuda): + options = EventOptions(enable_timing=False) + stream = Device().create_stream() + event = stream.record(options=options) + assert event.is_done is True diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 874d7f07..08f7e6d3 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -1,68 +1,68 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, LaunchConfig, Stream - - -def test_launch_config_init(init_cuda): - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0) - assert config.grid == (1, 1, 1) - assert config.block == (1, 1, 1) - assert config.stream is None - assert config.shmem_size == 0 - - config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024) - assert config.grid == (2, 2, 2) - assert config.block == (2, 2, 2) - assert isinstance(config.stream, Stream) - assert config.shmem_size == 1024 - - -def test_launch_config_cast_to_3_tuple(): - config = LaunchConfig(grid=1, block=1) - assert config._cast_to_3_tuple(1) == (1, 1, 1) - assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1) - assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3) - - # Edge cases - assert config._cast_to_3_tuple(999) == (999, 1, 1) - assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1) - assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777) - - -def test_launch_config_invalid_values(): - with pytest.raises(ValueError): - LaunchConfig(grid=0, block=1) - - with pytest.raises(ValueError): - LaunchConfig(grid=(0, 1), block=1) - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=0) - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=(0, 1)) - - -def test_launch_config_stream(init_cuda): - stream = Device().create_stream() - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0) - assert config.stream == stream - - with pytest.raises(ValueError): - LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0) - - -def test_launch_config_shmem_size(): - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048) - assert config.shmem_size == 2048 - - config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None) - assert config.shmem_size == 0 +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest + +from cuda.core.experimental import Device, LaunchConfig, Stream + + +def test_launch_config_init(init_cuda): + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=0) + assert config.grid == (1, 1, 1) + assert config.block == (1, 1, 1) + assert config.stream is None + assert config.shmem_size == 0 + + config = LaunchConfig(grid=(2, 2, 2), block=(2, 2, 2), stream=Device().create_stream(), shmem_size=1024) + assert config.grid == (2, 2, 2) + assert config.block == (2, 2, 2) + assert isinstance(config.stream, Stream) + assert config.shmem_size == 1024 + + +def test_launch_config_cast_to_3_tuple(): + config = LaunchConfig(grid=1, block=1) + assert config._cast_to_3_tuple(1) == (1, 1, 1) + assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1) + assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3) + + # Edge cases + assert config._cast_to_3_tuple(999) == (999, 1, 1) + assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1) + assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777) + + +def test_launch_config_invalid_values(): + with pytest.raises(ValueError): + LaunchConfig(grid=0, block=1) + + with pytest.raises(ValueError): + LaunchConfig(grid=(0, 1), block=1) + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=0) + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=(0, 1)) + + +def test_launch_config_stream(init_cuda): + stream = Device().create_stream() + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=stream, shmem_size=0) + assert config.stream == stream + + with pytest.raises(ValueError): + LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream="invalid_stream", shmem_size=0) + + +def test_launch_config_shmem_size(): + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None, shmem_size=2048) + assert config.shmem_size == 2048 + + config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1), stream=None) + assert config.shmem_size == 0 diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index c78b5673..a48db69b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,213 +1,213 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -try: - from cuda.bindings import driver -except ImportError: - from cuda import cuda as driver - -import ctypes - -from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, MemoryResource -from cuda.core.experimental._utils import handle_return - - -class DummyDeviceMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAlloc(size)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return False - - @property - def device_id(self) -> int: - return 0 - - -class DummyHostMemoryResource(MemoryResource): - def __init__(self): - pass - - def allocate(self, size, stream=None) -> Buffer: - # Allocate a ctypes buffer of size `size` - ptr = (ctypes.c_byte * size)() - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - # the memory is deallocated per the ctypes deallocation at garbage collection time - pass - - @property - def is_device_accessible(self) -> bool: - return False - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") - - -class DummyUnifiedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFree(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - return 0 - - -class DummyPinnedMemoryResource(MemoryResource): - def __init__(self, device): - self.device = device - - def allocate(self, size, stream=None) -> Buffer: - ptr = handle_return(driver.cuMemAllocHost(size)) - return Buffer(ptr=ptr, size=size, mr=self) - - def deallocate(self, ptr, size, stream=None): - handle_return(driver.cuMemFreeHost(ptr)) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return True - - @property - def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") - - -def buffer_initialization(dummy_mr: MemoryResource): - buffer = dummy_mr.allocate(size=1024) - assert buffer.handle != 0 - assert buffer.size == 1024 - assert buffer.memory_resource == dummy_mr - assert buffer.is_device_accessible == dummy_mr.is_device_accessible - assert buffer.is_host_accessible == dummy_mr.is_host_accessible - buffer.close() - - -def test_buffer_initialization(): - device = Device() - device.set_current() - buffer_initialization(DummyDeviceMemoryResource(device)) - buffer_initialization(DummyHostMemoryResource()) - buffer_initialization(DummyUnifiedMemoryResource(device)) - buffer_initialization(DummyPinnedMemoryResource(device)) - - -def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): - src_buffer = dummy_mr.allocate(size=1024) - dst_buffer = dummy_mr.allocate(size=1024) - stream = device.create_stream() - - if check: - src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - for i in range(1024): - src_ptr[i] = ctypes.c_byte(i) - - src_buffer.copy_to(dst_buffer, stream=stream) - device.sync() - - if check: - dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - - for i in range(10): - assert dst_ptr[i] == src_ptr[i] - - dst_buffer.close() - src_buffer.close() - - -def test_buffer_copy_to(): - device = Device() - device.set_current() - buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) - - -def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): - src_buffer = dummy_mr.allocate(size=1024) - dst_buffer = dummy_mr.allocate(size=1024) - stream = device.create_stream() - - if check: - src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - for i in range(1024): - src_ptr[i] = ctypes.c_byte(i) - - dst_buffer.copy_from(src_buffer, stream=stream) - device.sync() - - if check: - dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) - - for i in range(10): - assert dst_ptr[i] == src_ptr[i] - - dst_buffer.close() - src_buffer.close() - - -def test_buffer_copy_from(): - device = Device() - device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) - - -def buffer_close(dummy_mr: MemoryResource): - buffer = dummy_mr.allocate(size=1024) - buffer.close() - assert buffer.handle == 0 - assert buffer.memory_resource is None - - -def test_buffer_close(): - device = Device() - device.set_current() - buffer_close(DummyDeviceMemoryResource(device)) - buffer_close(DummyHostMemoryResource()) - buffer_close(DummyUnifiedMemoryResource(device)) - buffer_close(DummyPinnedMemoryResource(device)) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +try: + from cuda.bindings import driver +except ImportError: + from cuda import cuda as driver + +import ctypes + +from cuda.core.experimental import Device +from cuda.core.experimental._memory import Buffer, MemoryResource +from cuda.core.experimental._utils import handle_return + + +class DummyDeviceMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAlloc(size)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return 0 + + +class DummyHostMemoryResource(MemoryResource): + def __init__(self): + pass + + def allocate(self, size, stream=None) -> Buffer: + # Allocate a ctypes buffer of size `size` + ptr = (ctypes.c_byte * size)() + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + # the memory is deallocated per the ctypes deallocation at garbage collection time + pass + + @property + def is_device_accessible(self) -> bool: + return False + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class DummyUnifiedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocManaged(size, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + return 0 + + +class DummyPinnedMemoryResource(MemoryResource): + def __init__(self, device): + self.device = device + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(driver.cuMemAllocHost(size)) + return Buffer(ptr=ptr, size=size, mr=self) + + def deallocate(self, ptr, size, stream=None): + handle_return(driver.cuMemFreeHost(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return True + + @property + def device_id(self) -> int: + raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +def buffer_initialization(dummy_mr: MemoryResource): + buffer = dummy_mr.allocate(size=1024) + assert buffer.handle != 0 + assert buffer.size == 1024 + assert buffer.memory_resource == dummy_mr + assert buffer.is_device_accessible == dummy_mr.is_device_accessible + assert buffer.is_host_accessible == dummy_mr.is_host_accessible + buffer.close() + + +def test_buffer_initialization(): + device = Device() + device.set_current() + buffer_initialization(DummyDeviceMemoryResource(device)) + buffer_initialization(DummyHostMemoryResource()) + buffer_initialization(DummyUnifiedMemoryResource(device)) + buffer_initialization(DummyPinnedMemoryResource(device)) + + +def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): + src_buffer = dummy_mr.allocate(size=1024) + dst_buffer = dummy_mr.allocate(size=1024) + stream = device.create_stream() + + if check: + src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + for i in range(1024): + src_ptr[i] = ctypes.c_byte(i) + + src_buffer.copy_to(dst_buffer, stream=stream) + device.sync() + + if check: + dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + + for i in range(10): + assert dst_ptr[i] == src_ptr[i] + + dst_buffer.close() + src_buffer.close() + + +def test_buffer_copy_to(): + device = Device() + device.set_current() + buffer_copy_to(DummyDeviceMemoryResource(device), device) + buffer_copy_to(DummyUnifiedMemoryResource(device), device) + buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + + +def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): + src_buffer = dummy_mr.allocate(size=1024) + dst_buffer = dummy_mr.allocate(size=1024) + stream = device.create_stream() + + if check: + src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + for i in range(1024): + src_ptr[i] = ctypes.c_byte(i) + + dst_buffer.copy_from(src_buffer, stream=stream) + device.sync() + + if check: + dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte)) + + for i in range(10): + assert dst_ptr[i] == src_ptr[i] + + dst_buffer.close() + src_buffer.close() + + +def test_buffer_copy_from(): + device = Device() + device.set_current() + buffer_copy_from(DummyDeviceMemoryResource(device), device) + buffer_copy_from(DummyUnifiedMemoryResource(device), device) + buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + + +def buffer_close(dummy_mr: MemoryResource): + buffer = dummy_mr.allocate(size=1024) + buffer.close() + assert buffer.handle == 0 + assert buffer.memory_resource is None + + +def test_buffer_close(): + device = Device() + device.set_current() + buffer_close(DummyDeviceMemoryResource(device)) + buffer_close(DummyHostMemoryResource()) + buffer_close(DummyUnifiedMemoryResource(device)) + buffer_close(DummyPinnedMemoryResource(device)) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 5f0b6056..a976726f 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -1,48 +1,48 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import importlib - -import pytest - -from cuda.core.experimental._module import ObjectCode - - -@pytest.mark.skipif( - int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, - reason="Module loading for older drivers validate require valid module code.", -) -def test_object_code_initialization(): - # Test with supported code types - for code_type in ["cubin", "ptx", "fatbin"]: - module_data = b"dummy_data" - obj_code = ObjectCode(module_data, code_type) - assert obj_code._code_type == code_type - assert obj_code._module == module_data - assert obj_code._handle is not None - - # Test with unsupported code type - with pytest.raises(ValueError): - ObjectCode(b"dummy_data", "unsupported_code_type") - - -# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile -def test_object_code_initialization_with_str(): - assert True - - -def test_object_code_initialization_with_jit_options(): - assert True - - -def test_object_code_get_kernel(): - assert True - - -def test_kernel_from_obj(): - assert True +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import importlib + +import pytest + +from cuda.core.experimental._module import ObjectCode + + +@pytest.mark.skipif( + int(importlib.metadata.version("cuda-python").split(".")[0]) < 12, + reason="Module loading for older drivers validate require valid module code.", +) +def test_object_code_initialization(): + # Test with supported code types + for code_type in ["cubin", "ptx", "fatbin"]: + module_data = b"dummy_data" + obj_code = ObjectCode(module_data, code_type) + assert obj_code._code_type == code_type + assert obj_code._module == module_data + assert obj_code._handle is not None + + # Test with unsupported code type + with pytest.raises(ValueError): + ObjectCode(b"dummy_data", "unsupported_code_type") + + +# TODO add ObjectCode tests which provide the appropriate data for cuLibraryLoadFromFile +def test_object_code_initialization_with_str(): + assert True + + +def test_object_code_initialization_with_jit_options(): + assert True + + +def test_object_code_get_kernel(): + assert True + + +def test_kernel_from_obj(): + assert True diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index af94a7ba..95c4d377 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -1,66 +1,66 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Program -from cuda.core.experimental._module import Kernel, ObjectCode - - -def test_program_init_valid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - assert program.handle is not None - - -def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") - - -def test_program_init_invalid_code_format(): - code = 12345 - with pytest.raises(TypeError): - Program(code, "c++") - - -def test_program_compile_valid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) - - -def test_program_compile_invalid_target_type(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - with pytest.raises(NotImplementedError): - program.compile("invalid_target") - - -def test_program_backend_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.backend == "nvrtc" - - -def test_program_handle_property(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - assert program.handle is not None - - -def test_program_close(): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - program.close() - assert program.handle is None +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest + +from cuda.core.experimental import Program +from cuda.core.experimental._module import Kernel, ObjectCode + + +def test_program_init_valid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + assert program.handle is not None + + +def test_program_init_invalid_code_type(): + code = 'extern "C" __global__ void my_kernel() {}' + with pytest.raises(NotImplementedError): + Program(code, "python") + + +def test_program_init_invalid_code_format(): + code = 12345 + with pytest.raises(TypeError): + Program(code, "c++") + + +def test_program_compile_valid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + object_code = program.compile("ptx") + kernel = object_code.get_kernel("my_kernel") + assert isinstance(object_code, ObjectCode) + assert isinstance(kernel, Kernel) + + +def test_program_compile_invalid_target_type(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + with pytest.raises(NotImplementedError): + program.compile("invalid_target") + + +def test_program_backend_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.backend == "nvrtc" + + +def test_program_handle_property(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + assert program.handle is not None + + +def test_program_close(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + program.close() + assert program.handle is None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 03cdd852..9c661192 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -1,115 +1,115 @@ -# Copyright 2024 NVIDIA Corporation. All rights reserved. -# -# Please refer to the NVIDIA end user license agreement (EULA) associated -# with this source code for terms and conditions that govern your use of -# this software. Any use, reproduction, disclosure, or distribution of -# this software and related documentation outside the terms of the EULA -# is strictly prohibited. - -import pytest - -from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream - - -def test_stream_init(): - with pytest.raises(NotImplementedError): - Stream() - - -def test_stream_init_with_options(init_cuda): - stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) - assert stream.is_nonblocking is True - assert stream.priority == 0 - - -def test_stream_handle(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - assert isinstance(stream.handle, int) - - -def test_stream_is_nonblocking(init_cuda): - stream = Device().create_stream(options=StreamOptions(nonblocking=True)) - assert stream.is_nonblocking is True - - -def test_stream_priority(init_cuda): - stream = Device().create_stream(options=StreamOptions(priority=0)) - assert stream.priority == 0 - stream = Device().create_stream(options=StreamOptions(priority=-1)) - assert stream.priority == -1 - with pytest.raises(ValueError): - stream = Device().create_stream(options=StreamOptions(priority=1)) - - -def test_stream_sync(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - stream.sync() # Should not raise any exceptions - - -def test_stream_record(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - event = stream.record() - assert isinstance(event, Event) - - -def test_stream_record_invalid_event(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(TypeError): - stream.record(event="invalid_event") - - -def test_stream_wait_event(init_cuda): - s1 = Device().create_stream() - s2 = Device().create_stream() - e1 = s1.record() - s2.wait(e1) # Should not raise any exceptions - s2.sync() - - -def test_stream_wait_invalid_event(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - with pytest.raises(ValueError): - stream.wait(event_or_stream="invalid_event") - - -def test_stream_device(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - device = stream.device - assert isinstance(device, Device) - - -def test_stream_context(init_cuda): - stream = Device().create_stream(options=StreamOptions()) - context = stream.context - assert context is not None - - -def test_stream_from_foreign_stream(init_cuda): - device = Device() - other_stream = device.create_stream(options=StreamOptions()) - stream = device.create_stream(obj=other_stream) - assert other_stream.handle == stream.handle - device = stream.device - assert isinstance(device, Device) - context = stream.context - assert context is not None - - -def test_stream_from_handle(): - stream = Stream.from_handle(0) - assert isinstance(stream, Stream) - - -def test_legacy_default_stream(): - assert isinstance(LEGACY_DEFAULT_STREAM, Stream) - - -def test_per_thread_default_stream(): - assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) - - -def test_default_stream(): - stream = default_stream() - assert isinstance(stream, Stream) +# Copyright 2024 NVIDIA Corporation. All rights reserved. +# +# Please refer to the NVIDIA end user license agreement (EULA) associated +# with this source code for terms and conditions that govern your use of +# this software. Any use, reproduction, disclosure, or distribution of +# this software and related documentation outside the terms of the EULA +# is strictly prohibited. + +import pytest + +from cuda.core.experimental import Device, Stream, StreamOptions +from cuda.core.experimental._event import Event +from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream + + +def test_stream_init(): + with pytest.raises(NotImplementedError): + Stream() + + +def test_stream_init_with_options(init_cuda): + stream = Device().create_stream(options=StreamOptions(nonblocking=True, priority=0)) + assert stream.is_nonblocking is True + assert stream.priority == 0 + + +def test_stream_handle(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + assert isinstance(stream.handle, int) + + +def test_stream_is_nonblocking(init_cuda): + stream = Device().create_stream(options=StreamOptions(nonblocking=True)) + assert stream.is_nonblocking is True + + +def test_stream_priority(init_cuda): + stream = Device().create_stream(options=StreamOptions(priority=0)) + assert stream.priority == 0 + stream = Device().create_stream(options=StreamOptions(priority=-1)) + assert stream.priority == -1 + with pytest.raises(ValueError): + stream = Device().create_stream(options=StreamOptions(priority=1)) + + +def test_stream_sync(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + stream.sync() # Should not raise any exceptions + + +def test_stream_record(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + event = stream.record() + assert isinstance(event, Event) + + +def test_stream_record_invalid_event(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + with pytest.raises(TypeError): + stream.record(event="invalid_event") + + +def test_stream_wait_event(init_cuda): + s1 = Device().create_stream() + s2 = Device().create_stream() + e1 = s1.record() + s2.wait(e1) # Should not raise any exceptions + s2.sync() + + +def test_stream_wait_invalid_event(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + with pytest.raises(ValueError): + stream.wait(event_or_stream="invalid_event") + + +def test_stream_device(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + device = stream.device + assert isinstance(device, Device) + + +def test_stream_context(init_cuda): + stream = Device().create_stream(options=StreamOptions()) + context = stream.context + assert context is not None + + +def test_stream_from_foreign_stream(init_cuda): + device = Device() + other_stream = device.create_stream(options=StreamOptions()) + stream = device.create_stream(obj=other_stream) + assert other_stream.handle == stream.handle + device = stream.device + assert isinstance(device, Device) + context = stream.context + assert context is not None + + +def test_stream_from_handle(): + stream = Stream.from_handle(0) + assert isinstance(stream, Stream) + + +def test_legacy_default_stream(): + assert isinstance(LEGACY_DEFAULT_STREAM, Stream) + + +def test_per_thread_default_stream(): + assert isinstance(PER_THREAD_DEFAULT_STREAM, Stream) + + +def test_default_stream(): + stream = default_stream() + assert isinstance(stream, Stream)