diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 322f859e3..0fd694ee5 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -180,6 +180,24 @@ jobs: host-platform: ${{ inputs.host-platform }} cuda-version: ${{ inputs.cuda-version }} + - name: Set up compute-sanitizer + run: | + # We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort + # We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix + # Only local ctk installs have compute-sanitizer; there is no wheel for it + if [[ "${{ inputs.python-version }}" == "3.12" && "${{ inputs.cuda-version }}" != "11.8.0" && "${{ inputs.local-ctk }}" == 1 ]]; then + COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer" + COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g') + SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1" + if [[ "$COMPUTE_SANITIZER_VERSION" -ge 202111 ]]; then + SANITIZER_CMD="${SANITIZER_CMD} --padding=32" + fi + echo "CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER=1" >> $GITHUB_ENV + else + SANITIZER_CMD="" + fi + echo "SANITIZER_CMD=${SANITIZER_CMD}" >> $GITHUB_ENV + - name: Run cuda.bindings tests if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} run: | @@ -194,7 +212,7 @@ jobs: pushd ./cuda_bindings pip install -r requirements.txt - pytest -rxXs -v tests/ + ${SANITIZER_CMD} pytest -rxXs -v tests/ # It is a bit convoluted to run the Cython tests against CTK wheels, # so let's just skip them. 
@@ -202,10 +220,10 @@ jobs: if [[ "${{ inputs.host-platform }}" == linux* ]]; then bash tests/cython/build_tests.sh elif [[ "${{ inputs.host-platform }}" == win* ]]; then - # TODO: enable this once win-64 runners are up + # TODO: enable this once win-64 runners are up exit 1 - fi - pytest -rxXs -v tests/cython + fi + ${SANITIZER_CMD} pytest -rxXs -v tests/cython fi popd @@ -229,7 +247,7 @@ jobs: pushd ./cuda_core pip install -r "tests/requirements-cu${TEST_CUDA_MAJOR}.txt" - pytest -rxXs -v tests/ + ${SANITIZER_CMD} pytest -rxXs -v tests/ # It is a bit convoluted to run the Cython tests against CTK wheels, # so let's just skip them. Also, currently our CI always installs the @@ -243,7 +261,7 @@ jobs: # TODO: enable this once win-64 runners are up exit 1 fi - pytest -rxXs -v tests/cython + ${SANITIZER_CMD} pytest -rxXs -v tests/cython fi popd diff --git a/cuda_bindings/docs/source/environment_variables.md b/cuda_bindings/docs/source/environment_variables.md index 7329e582c..67b52b4dd 100644 --- a/cuda_bindings/docs/source/environment_variables.md +++ b/cuda_bindings/docs/source/environment_variables.md @@ -11,3 +11,8 @@ ## Runtime Environment Variables - `CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM` : When set to 1, the default stream is the per-thread default stream. When set to 0, the default stream is the legacy default stream. This defaults to 0, for the legacy default stream. See [Stream Synchronization Behavior](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) for an explanation of the legacy and per-thread default streams. + + +## Test-Time Environment Variables + +- `CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER` : When set to 1, tests that would cause [compute-sanitizer](https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html) to raise an error are skipped. 
diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py new file mode 100644 index 000000000..45767fb78 --- /dev/null +++ b/cuda_bindings/tests/conftest.py @@ -0,0 +1,8 @@ +import os + +import pytest + +skipif_testing_with_compute_sanitizer = pytest.mark.skipif( + os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1", + reason="The compute-sanitizer is running, and this test causes an API error.", +) diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index 3f28d55f6..612401f3e 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -11,6 +11,7 @@ import numpy as np import pytest +from conftest import skipif_testing_with_compute_sanitizer import cuda.cuda as cuda import cuda.cudart as cudart @@ -83,6 +84,7 @@ def test_cuda_memcpy(): assert err == cuda.CUresult.CUDA_SUCCESS +@skipif_testing_with_compute_sanitizer def test_cuda_array(): (err,) = cuda.cuInit(0) assert err == cuda.CUresult.CUDA_SUCCESS @@ -236,6 +238,7 @@ def test_cuda_uuid_list_access(): assert err == cuda.CUresult.CUDA_SUCCESS +@skipif_testing_with_compute_sanitizer def test_cuda_cuModuleLoadDataEx(): (err,) = cuda.cuInit(0) assert err == cuda.CUresult.CUDA_SUCCESS @@ -251,6 +254,7 @@ def test_cuda_cuModuleLoadDataEx(): cuda.CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, cuda.CUjit_option.CU_JIT_LOG_VERBOSE, ] + # FIXME: This function call raises CUDA_ERROR_INVALID_VALUE err, mod = cuda.cuModuleLoadDataEx(0, 0, option_keys, []) (err,) = cuda.cuCtxDestroy(ctx) @@ -622,6 +626,7 @@ def test_cuda_coredump_attr(): assert err == cuda.CUresult.CUDA_SUCCESS +@skipif_testing_with_compute_sanitizer def test_get_error_name_and_string(): (err,) = cuda.cuInit(0) assert err == cuda.CUresult.CUDA_SUCCESS @@ -951,6 +956,7 @@ def test_CUmemDecompressParams_st(): assert int(desc.dstActBytes) == 0 +@skipif_testing_with_compute_sanitizer def test_all_CUresult_codes(): max_code = int(max(cuda.CUresult)) # Smoke test. 
CUDA_ERROR_UNKNOWN = 999, but intentionally using literal value. @@ -983,18 +989,21 @@ def test_all_CUresult_codes(): assert num_good >= 76 # CTK 11.0.3_450.51.06 +@skipif_testing_with_compute_sanitizer def test_cuKernelGetName_failure(): err, name = cuda.cuKernelGetName(0) assert err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE assert name is None +@skipif_testing_with_compute_sanitizer def test_cuFuncGetName_failure(): err, name = cuda.cuFuncGetName(0) assert err == cuda.CUresult.CUDA_ERROR_INVALID_VALUE assert name is None +@skipif_testing_with_compute_sanitizer @pytest.mark.skipif( driverVersionLessThan(12080) or not supportsCudaAPI("cuCheckpointProcessGetState"), reason="When API was introduced", diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py index 88f1b968a..f7eb1abb9 100644 --- a/cuda_bindings/tests/test_cudart.py +++ b/cuda_bindings/tests/test_cudart.py @@ -10,6 +10,7 @@ import numpy as np import pytest +from conftest import skipif_testing_with_compute_sanitizer import cuda.cuda as cuda import cuda.cudart as cudart @@ -70,6 +71,7 @@ def test_cudart_memcpy(): assertSuccess(err) +@skipif_testing_with_compute_sanitizer def test_cudart_hostRegister(): # Use hostRegister API to check for correct enum return values page_size = 80 diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 889372417..23bb0274e 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -64,3 +64,9 @@ def clean_up_cffi_files(): os.remove(f) except FileNotFoundError: pass # noqa: SIM105 + + +skipif_testing_with_compute_sanitizer = pytest.mark.skipif( + os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1", + reason="The compute-sanitizer is running, and this test causes an API error.", +) diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index 5f94e545f..e96d904eb 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -3,6 
+3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest +from conftest import skipif_testing_with_compute_sanitizer from cuda.bindings import driver, runtime from cuda.core.experimental._utils import cuda_utils @@ -40,6 +41,8 @@ def test_runtime_cuda_error_explanations_health(): assert not extra_expl +# this test causes an API error when the driver is too old to know about all of the error codes +@skipif_testing_with_compute_sanitizer def test_check_driver_error(): num_unexpected = 0 for error in driver.CUresult: diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 4895c0a67..f568ecdba 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -12,6 +12,7 @@ import numpy as np import pytest +from conftest import skipif_testing_with_compute_sanitizer import cuda.core.experimental from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch @@ -75,6 +76,7 @@ def test_is_done(init_cuda): assert event.is_done in (True, False) +@skipif_testing_with_compute_sanitizer def test_error_timing_disabled(): device = Device() device.set_current() @@ -97,6 +99,7 @@ def test_error_timing_disabled(): event2 - event1 +@skipif_testing_with_compute_sanitizer def test_error_timing_recorded(): device = Device() device.set_current() @@ -117,6 +120,7 @@ def test_error_timing_recorded(): # TODO: improve this once path finder can find headers +@skipif_testing_with_compute_sanitizer @pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header") @pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+") def test_error_timing_incomplete(): diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 78195c2dc..f15e98a42 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE 
import pytest +from conftest import skipif_testing_with_compute_sanitizer from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker from cuda.core.experimental._module import ObjectCode @@ -140,6 +141,8 @@ def test_linker_link_invalid_target_type(compile_ptx_functions): linker.link("invalid_target") +# this test causes an API error when using the culink API +@skipif_testing_with_compute_sanitizer def test_linker_get_error_log(compile_ptx_functions): options = LinkerOptions(arch=ARCH)