
Commit 39a101d

zasdfgbnm authored and facebook-github-bot committed
Make GPU loops support mutable lambda (pytorch#35015)
Summary:
I will need it for pytorch#34004.

The `mutable` qualifier allows a lambda to capture some values by copy and modify its own copy. This is useful for random kernels: we capture an RNG `state`, initialize it on the first run, and use the initialized state afterwards:

```C++
gpu_kernel(iter, [state, initialized](scalar_t arg) mutable -> scalar_t {
  if (!initialized) {
    curand_init(..., state);
    initialized = true;
  }
  return some_math(curand_uniform(state), arg);
});
```

The `operator()` of a `mutable` lambda is not `const`, so it cannot be passed as a constant reference, nor can it be called from inside a non-`mutable` lambda.

Example usage:

```C++
auto t = at::empty({4096}, kCUDA);
float thread_work_index_ = 0;
auto iter = TensorIterator::nullary_op(t);
gpu_kernel(iter, [thread_work_index_]GPU_LAMBDA() mutable -> float {
  return thread_work_index_++;
});
```

Pull Request resolved: pytorch#35015

Differential Revision: D20624698

Pulled By: ngimel

fbshipit-source-id: 06e3987793451cd514181d20252510297e2d28a9
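To make that constraint concrete, here is a minimal host-only C++ sketch (not part of this commit) of why a `mutable` lambda cannot be invoked through a `const` reference: the by-value captures behave like ordinary members of the closure object, and its `operator()` is non-`const`.

```C++
#include <iostream>

int main() {
  int counter = 0;
  // Captures `counter` by value; `mutable` lets the lambda modify its own copy.
  auto next = [counter]() mutable { return counter++; };

  std::cout << next() << "\n";   // 0
  std::cout << next() << "\n";   // 1
  std::cout << counter << "\n";  // 0 -- the original variable is untouched

  // const auto& cref = next;
  // cref();  // would not compile: operator() of a mutable lambda is not const

  return 0;
}
```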
1 parent edad9c1 commit 39a101d

5 files changed: +47 −29 lines changed

aten/src/ATen/native/cuda/CUDALoops.cuh

+13-13
@@ -121,42 +121,42 @@ static OffsetCalculator<N> make_offset_calculator(const TensorIterator& iter) {
 }
 
 template<int nt, int vt, typename func_t>
-static void launch_kernel(int64_t N, const func_t& f) {
+static void launch_kernel(int64_t N, func_t &&f) {
   TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
   if (N == 0) {
     return;
   }
   dim3 block(nt);
   dim3 grid((N + block.x * vt - 1) / (block.x * vt));
   auto stream = at::cuda::getCurrentCUDAStream();
-  elementwise_kernel<nt, vt, func_t><<<grid, block, 0, stream>>>(N, f);
+  elementwise_kernel<nt, vt, func_t><<<grid, block, 0, stream>>>(N, std::move(f));
   AT_CUDA_CHECK(cudaGetLastError());
 }
 
 template <typename traits, typename func_t, typename index_t, size_t... INDEX>
 C10_HOST_DEVICE typename traits::result_type
-invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i,
+invoke_impl(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i,
             std::index_sequence<INDEX...>) {
   return f(*(typename traits::template arg<INDEX>::type*)(data[INDEX] + i * strides[INDEX])...);
 }
 
 template <typename func_t, typename index_t, typename traits = function_traits<func_t>>
 C10_HOST_DEVICE typename traits::result_type
-invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i) {
+invoke(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i) {
   using Indices = std::make_index_sequence<traits::arity>;
   return invoke_impl<traits>(f, data, strides, i, Indices{});
 }
 
 template <typename traits, typename func_t, typename index_t, size_t... I>
 C10_HOST_DEVICE typename traits::result_type
-invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i,
+invoke_impl(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i,
             std::index_sequence<I...>) {
   return f(c10::fetch_and_cast<typename traits::template arg<I>::type>(dtypes[I], data[I] + i * strides[I])...);
 }
 
 template <typename func_t, typename index_t, typename traits = function_traits<func_t>>
 C10_HOST_DEVICE typename traits::result_type
-invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) {
+invoke(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) {
   using Indices = std::make_index_sequence<traits::arity>;
   return invoke_impl<traits>(f, data, strides, dtypes, i, Indices{});
 }

@@ -167,7 +167,7 @@ invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[]
 namespace modern {
 
 template<typename func_t, typename policy_t>
-__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {
+__device__ inline void elementwise_kernel_helper(func_t &f, policy_t policy) {
   using traits = function_traits<func_t>;
   using return_t = typename traits::result_type;
   using args_t = typename traits::ArgsTuple;

@@ -218,7 +218,7 @@ __global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data, inp_c
 
 // this function assume trivial 1d and no dynamic casting
 template<typename func_t, typename array_t>
-static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t data) {
+static inline void launch_vectorized_kernel(int64_t N, func_t& f, array_t data) {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
   using traits = function_traits<func_t>;
   int64_t grid = (N + block_work_size - 1) / block_work_size;

@@ -256,7 +256,7 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da
 
 
 template <typename func_t>
-void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
+void gpu_kernel_impl(TensorIterator& iter, func_t f) {
   using traits = function_traits<func_t>;
   using arg0_t = typename traits::result_type;
   constexpr int ntensors = traits::arity + 1;

@@ -300,28 +300,28 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
     }
 
     if (needs_dynamic_casting<func_t>::check(iter)) {
-      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) mutable {
         void* out = data[0] + strides[0] * idx;
         arg0_t result = legacy::invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
         c10::cast_and_store<arg0_t>(dtypes[0], out, result);
       });
     } else {
-      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) mutable {
         arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx);
         *out = legacy::invoke(f, &data.data[1], &strides.data[1], idx);
       });
     }
   } else {
     auto offset_calc = legacy::make_offset_calculator<traits::arity + 1>(iter);
     if (needs_dynamic_casting<func_t>::check(iter)) {
-      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) mutable {
        auto offsets = offset_calc.get(idx);
        void* out = data[0] + offsets[0];
        arg0_t result = legacy::invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
       });
     } else {
-      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) mutable {
        auto offsets = offset_calc.get(idx);
        arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
        *out = legacy::invoke(f, &data.data[1], &offsets.data[1], 1);
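The `mutable` added to the `[=]` wrapper lambdas above is needed once `legacy::invoke` takes `func_t&` instead of `const func_t&`: the wrapper captures `f` by value, and without `mutable` that copy is effectively `const` inside `operator()` and cannot be used to call a `mutable` user lambda. A minimal host-only sketch of the same constraint, using hypothetical names (`invoke`, `wrapper`) that are not part of the PyTorch code:

```C++
#include <cassert>

// Stand-in for legacy::invoke after this change: it takes a non-const reference.
template <typename func_t>
int invoke(func_t& f, int x) {
  return f(x);
}

int main() {
  int calls = 0;
  // A stateful user lambda, analogous to the RNG example in the summary.
  auto f = [calls](int x) mutable { ++calls; return x + calls; };

  // The wrapper captures `f` by value. Without `mutable` here, the captured
  // copy of `f` would be const inside operator(), and a mutable lambda
  // cannot be called through a const object.
  auto wrapper = [=](int idx) mutable { return invoke(f, idx); };

  assert(wrapper(10) == 11);  // wrapper's copy of f: calls == 1
  assert(wrapper(10) == 12);  // the copy's state persists across calls
  return 0;
}
```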

aten/src/ATen/native/cuda/ROCmLoops.cuh

+13-13
@@ -96,42 +96,42 @@ static OffsetCalculator<N> make_offset_calculator(const TensorIterator& iter) {
 }
 
 template<int nt, int vt, typename func_t>
-static void launch_kernel(int64_t N, const func_t& f) {
+static void launch_kernel(int64_t N, func_t&& f) {
   TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
   if (N == 0) {
     return;
   }
   dim3 block(nt);
   dim3 grid((N + block.x * vt - 1) / (block.x * vt));
   auto stream = at::cuda::getCurrentCUDAStream();
-  elementwise_kernel<nt, vt, func_t><<<grid, block, 0, stream>>>(N, f);
+  elementwise_kernel<nt, vt, func_t><<<grid, block, 0, stream>>>(N, std::move(f));
   AT_CUDA_CHECK(cudaGetLastError());
 }
 
 template <typename traits, typename func_t, typename index_t, size_t... INDEX>
 C10_HOST_DEVICE typename traits::result_type
-invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i,
+invoke_impl(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i,
             std::index_sequence<INDEX...>) {
   return f(*(typename traits::template arg<INDEX>::type*)(data[INDEX] + i * strides[INDEX])...);
 }
 
 template <typename func_t, typename index_t, typename traits = function_traits<func_t>>
 C10_HOST_DEVICE typename traits::result_type
-invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i) {
+invoke(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i) {
   using Indices = std::make_index_sequence<traits::arity>;
   return invoke_impl<traits>(f, data, strides, i, Indices{});
 }
 
 template <typename traits, typename func_t, typename index_t, size_t... I>
 C10_HOST_DEVICE typename traits::result_type
-invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i,
+invoke_impl(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i,
             std::index_sequence<I...>) {
   return f(c10::fetch_and_cast<typename traits::template arg<I>::type>(dtypes[I], data[I] + i * strides[I])...);
 }
 
 template <typename func_t, typename index_t, typename traits = function_traits<func_t>>
 C10_HOST_DEVICE typename traits::result_type
-invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) {
+invoke(func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) {
   using Indices = std::make_index_sequence<traits::arity>;
   return invoke_impl<traits>(f, data, strides, dtypes, i, Indices{});
 }

@@ -259,7 +259,7 @@ __global__ void elementwise_kernel(int N, func_t f, array_t data) {
 
 // TODO (@zasdfgbnm): this function assume trivial 1d and no dynamic casting
 template<typename func_t, typename array_t, std::enable_if_t<detail::has_same_arg_types<func_t>::value, int> = 0>
-static void launch_kernel(int64_t N, const func_t& f, array_t data) {
+static void launch_kernel(int64_t N, func_t f, array_t data) {
   TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
   if (N == 0) {
     return;

@@ -271,13 +271,13 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) {
 }
 
 template<typename func_t, typename array_t, std::enable_if_t<!detail::has_same_arg_types<func_t>::value, int> = 0>
-static void launch_kernel(int64_t N, const func_t& f, array_t data) {}
+static void launch_kernel(int64_t N, func_t f, array_t data) {}
 
 } // namespace modern
 
 
 template <typename func_t>
-void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
+void gpu_kernel_impl(TensorIterator& iter, func_t f) {
   using traits = function_traits<func_t>;
   using arg0_t = typename traits::result_type;
   constexpr int ntensors = traits::arity + 1;

@@ -304,30 +304,30 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
     }
 
     if (needs_dynamic_casting<func_t>::check(iter)) {
-      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) mutable {
        void* out = data[0] + strides[0] * idx;
        arg0_t result = legacy::invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
       });
     } else if (iter.has_contiguous_first_dim() && modern::detail::has_same_arg_types<func_t>::value) {
       modern::launch_kernel(numel, f, data);
     } else {
-      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) mutable {
        arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx);
        *out = legacy::invoke(f, &data.data[1], &strides.data[1], idx);
       });
     }
   } else {
     auto offset_calc = legacy::make_offset_calculator<traits::arity + 1>(iter);
     if (needs_dynamic_casting<func_t>::check(iter)) {
-      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) mutable {
        auto offsets = offset_calc.get(idx);
        void* out = data[0] + offsets[0];
        arg0_t result = legacy::invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
       });
     } else {
-      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
+      legacy::launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) mutable {
        auto offsets = offset_calc.get(idx);
        arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
        *out = legacy::invoke(f, &data.data[1], &offsets.data[1], 1);

aten/src/ATen/test/CMakeLists.txt

+1-1
@@ -47,7 +47,7 @@ list(APPEND ATen_CUDA_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_vectorized_test.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_loops_test.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_generator_test.cu)
 if (CAFFE2_USE_CUDNN)
   list(APPEND ATen_CUDA_TEST_SRCS

aten/src/ATen/test/cuda_vectorized_test.cu → aten/src/ATen/test/cuda_loops_test.cu

+18
@@ -5,6 +5,7 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/core/Array.h>
 
+using namespace at;
 using namespace at::native;
 using namespace at::native::memory;
 

@@ -25,6 +26,23 @@ void reset_buffers() {
   }
 }
 
+Tensor thread_work_index() {
+  auto t = at::empty({4096 * thread_work_size}, kCUDA);
+  float thread_work_index_ = 0;
+  auto iter = TensorIterator::nullary_op(t);
+  gpu_kernel(iter, [thread_work_index_]GPU_LAMBDA() mutable -> float {
+    return thread_work_index_++;
+  });
+  return t;
+}
+
+TEST(TestLoops, MutableLambda) {
+  auto t = thread_work_index();
+  for (float i = 0; i < thread_work_size; i++) {
+    ASSERT_EQ((t == i).to(kLong).sum().item<int64_t>(), 4096);
+  }
+}
+
 #ifdef __HIP_PLATFORM_HCC__
 TEST(TestLoops, HasSameArgTypes) {
   // This is a compile-time unit test. If this file compiles without error,

aten/tools/run_tests.sh

+2-2
@@ -39,8 +39,8 @@ fi
 if [[ -x ./cuda_half_test ]]; then
   ./cuda_half_test
 fi
-if [[ -x ./cuda_vectorized_test ]]; then
-  ./cuda_vectorized_test
+if [[ -x ./cuda_loops_test ]]; then
+  ./cuda_loops_test
 fi
 if [[ -x ./cuda_distributions_test ]]; then
   ./cuda_distributions_test
