Added cudaGetLastError() calls to reset benchmarking kernel errors (issue 88). (#173)

* Create and use NVBENCH_CUDA_CALL_RESET_ERROR.
* Moved cudaGetLastError() call to NVBENCH_CUDA_CALL macro

---------

Co-authored-by: Sergey Pavlov <psvvsp89@gmail.com>
NVIDIA · May 31, 2024 · a171514 · a171514
1 parent 088c9ee
commit a171514
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 0 deletions.
diff --git a/nvbench/cuda_call.cuh b/nvbench/cuda_call.cuh
@@ -24,12 +24,14 @@
 #include <string>
 
 /// Evaluates `call` once; throws a std::runtime_error if it doesn't return `cudaSuccess`.
+/// Before throwing, cudaGetLastError() is called so the runtime's last-error state is cleared.
 #define NVBENCH_CUDA_CALL(call)                                                                    \
   do                                                                                               \
   {                                                                                                \
     const cudaError_t nvbench_cuda_call_error = call;                                              \
     if (nvbench_cuda_call_error != cudaSuccess)                                                    \
     {                                                                                              \
+      cudaGetLastError();                                                                          \
       nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
     }                                                                                              \
   } while (false)

diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
@@ -16,6 +16,7 @@ set(test_srcs
   named_values.cu
   option_parser.cu
   range.cu
+  reset_error.cu
   ring_buffer.cu
   runner.cu
   state.cu

diff --git a/testing/reset_error.cu b/testing/reset_error.cu
@@ -0,0 +1,30 @@
+#include <nvbench/cuda_call.cuh>
+
+#include "test_asserts.cuh"
+
+
+namespace // anonymous: multiply5 has internal linkage, visible only in this translation unit
+{
+    __global__ void multiply5(const int32_t* __restrict__ a, int32_t* __restrict__ b) // kernel: b[id] = 5 * a[id], one element per thread
+    {
+      const auto id = blockIdx.x * blockDim.x + threadIdx.x; // flat global index built from the x dimension only
+      b[id] = 5 * a[id]; // no bounds check: every launched thread reads a[id] and writes b[id]
+    }
+} // namespace
+
+int main() // Test: after NVBENCH_CUDA_CALL throws on a failed call, cudaGetLastError() must report cudaSuccess.
+{ 
+  multiply5<<<256, 256>>>(nullptr, nullptr); // both pointers are null, so the kernel dereferences null; the test expects an error at the sync below
+
+  try
+  {
+    NVBENCH_CUDA_CALL(cudaStreamSynchronize(0)); // expected to throw: the sync should return a non-success status
+    ASSERT(false); // reached only if the macro did not throw
+  }
+  catch (const std::runtime_error &) // the exception type NVBENCH_CUDA_CALL is documented to throw
+  {
+    ASSERT(cudaGetLastError() == cudaError_t::cudaSuccess); // the macro's own cudaGetLastError() call should already have cleared the error
+  }
+
+  return 0;
+}