diff --git a/aten/src/ATen/native/cuda/Nonzero.cu b/aten/src/ATen/native/cuda/Nonzero.cu index e87f46cd844eab..e5fb9230de7637 100644 --- a/aten/src/ATen/native/cuda/Nonzero.cu +++ b/aten/src/ATen/native/cuda/Nonzero.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -70,7 +71,16 @@ void nonzero_cuda_out_impl(const Tensor& self, Tensor& out){ auto temp_storage = allocator.allocate(temp_storage_bytes); cub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); int num_nonzeros_h; - at::cuda::memcpy_and_sync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream); + auto pinned_num_nonzeros_h = at::detail::empty_cpu( + {1}, /* size */ + c10::CppTypeToScalarType(), /* dtype */ + std::nullopt, /* layout */ + std::nullopt, /* device */ + true, /* pin_memory */ + std::nullopt /* memory format */ + ); + at::cuda::memcpy_and_sync((void *)pinned_num_nonzeros_h.const_data_ptr(), num_nonzeros.get(), sizeof(int), cudaMemcpyDeviceToHost, stream); + num_nonzeros_h = (int)*(pinned_num_nonzeros_h.const_data_ptr()); //expected output size is num_nonzeros x ndim //we are producing output with size {num_nonzeros, ndim} and strides {1, num_nonzeros} (that is, transposed ndim x num_nonzeros output) //we are able to directly use passed output with this size and strides, and we can also (per contract)