From 527df8ca8d5b6d953bf7ae7de613170515ae5b6e Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Tue, 30 Mar 2021 07:04:13 +0000
Subject: [PATCH 1/6] enable async copy and add wait before sync operation

---
 paddle/fluid/memory/memcpy.cc | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index d616051ea6df5..c76d448013f04 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -209,12 +209,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
 
   platform::SetNPUDeviceId(dst_place.device);
 
-  // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
-  // which is different from CUDA. In Paddle, when async is called, "sync"
-  // is run actually, which means Paddle doesn't fully supported async.
-  // TODO(ascendrc): Support NPU memcpy async for better performance.
-  stream = nullptr;
-
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << " by thream(" << stream << ")";
 
@@ -222,6 +216,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
     platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
   } else {
+    // On NPU, an async operation after a sync operation is ok, while a sync
+    // operation after an async one is not, since the async operation may not
+    // be done yet. So we need to wait before the sync operation.
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
   }
@@ -237,12 +237,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
 
   platform::SetNPUDeviceId(src_place.device);
 
-  // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
-  // which is different from CUDA. In Paddle, when async is called, "sync"
-  // is run actually, which means Paddle doesn't fully supported async.
-  // TODO(ascendrc): Support NPU memcpy async for better performance.
-  stream = nullptr;
-
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << " by thream(" << stream << ")";
 
@@ -250,6 +244,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
     platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
   } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
     platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
       platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
                                stream);
     } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
       platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
       platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
     }
@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
       platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
                                stream);
     } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
       platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
       platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
     }

From 9b13b5a917c9134882b8aa7dd1b0a1aff4906bbe Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Tue, 30 Mar 2021 11:11:54 +0000
Subject: [PATCH 2/6] remove unnecessary wait

---
 .../fluid/operators/elementwise/elementwise_op_npu_test.cc | 6 ------
 paddle/fluid/platform/device_context.cc                    | 1 +
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index df6fae6c8484a..e6b5e5f8b7860 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
   TensorFromVector(init_y, ctx, tensor_y);
   tensor_y->Resize({10, 10});
 
-  ctx.Wait();
-
   auto place = ctx.GetPlace();
   auto out = scope->Var("Out");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
                     {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
-  ctx.Wait();
 
   std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
   TensorFromVector(init_dout, ctx, tensor_dout);
   tensor_dout->Resize({2, 3, 5});
 
-  ctx.Wait();
-
   // run
   f::AttributeMap attrs;
   auto op = f::OpRegistry::CreateOp(
@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
   auto place = ctx.GetPlace();
   op->Run(*scope, place);
-  ctx.Wait();
 
   std::vector<T> dx_vec;
   TensorToVector(*tensor_dx, ctx, &dx_vec);
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index e5031acb9b42e..000129852f097 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -254,6 +254,7 @@ NPUDeviceContext::~NPUDeviceContext() {
 
 void NPUDeviceContext::Wait() const {
   NPUDeviceGuard guard(place_.device);
+  VLOG(4) << "NPU context Wait";
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
 }
 

From f4ec99145962931390f9147cddfc245323554e66 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 31 Mar 2021 07:15:06 +0000
Subject: [PATCH 3/6] add FillNpuTensorWithConstant

---
 paddle/fluid/operators/npu_op_runner.cc | 40 +++++++++++++++++++++++--
 paddle/fluid/operators/npu_op_runner.h  |  3 ++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index aa0c4d2dfd274..778957cda403c 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-aclrtStream GetCurrentNPUStream() {
-  int device_id = platform::GetCurrentNPUDeviceId();
+aclrtStream GetCurrentNPUStream(int device_id = -1) {
+  if (device_id == -1) {
+    device_id = platform::GetCurrentNPUDeviceId();
+  }
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
       pool.Get(platform::NPUPlace(device_id)));
@@ -299,5 +301,39 @@ void NpuOpRunner::Run(aclrtStream stream) {
   VLOG(4) << "after aclopCompileAndExecute: " << ret;
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
+
+template <typename T>
+void FillNpuTensorWithConstant(Tensor *tensor, T val) {
+  PADDLE_ENFORCE_EQ(
+      tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument("The tensor should be initialized."));
+  PADDLE_ENFORCE_EQ(
+      platform::is_npu_place(tensor->place()), true,
+      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
+  // do async for better performance
+  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
+    Tensor tmp(tensor->type());
+    tmp.Resize(tensor->dims());
+    tmp.mutable_data<T>(place);
+
+    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
+                             GetCurrentNPUStream(tmp.device));
+    NpuOpRunner("Power", {tmp}, {*tensor},
+                {{"power", static_cast<float>(1)},
+                 {"scale", static_cast<float>(0)},
+                 {"shift", static_cast<float>(val)}});
+  } else {
+    T *array = new T[tensor->numel()];
+    for (unsigned int i = 0; i < tensor->numel(); ++i) {
+      array[i] = static_cast<T>(val);
+    }
+    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
+    // do sync copy
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
+                 tmp.data<void>(), platform::CPUPlace(), array, size, nullptr);
+    delete[] array;
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index e178f7fc6e96d..154480a2fdcc5 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -86,6 +86,9 @@ class NpuOpRunner {
 
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
 
+template <typename T>
+void FillNpuTensorWithConstant(Tensor *tensor, T val);
+
 }  // namespace operators
 }  // namespace paddle
 #endif

From e5d847ad51e92f2ad6342cf7fa03f4f721925206 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 31 Mar 2021 08:00:23 +0000
Subject: [PATCH 4/6] refine

---
 paddle/fluid/operators/npu_op_runner.cc | 35 +----------------------
 paddle/fluid/operators/npu_op_runner.h  | 37 ++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 778957cda403c..276bfa7b3281b 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -64,7 +64,7 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-aclrtStream GetCurrentNPUStream(int device_id = -1) {
+aclrtStream GetCurrentNPUStream(int device_id) {
   if (device_id == -1) {
     device_id = platform::GetCurrentNPUDeviceId();
   }
@@ -302,38 +302,5 @@ void NpuOpRunner::Run(aclrtStream stream) {
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
 
-template <typename T>
-void FillNpuTensorWithConstant(Tensor *tensor, T val) {
-  PADDLE_ENFORCE_EQ(
-      tensor->IsInitialized(), true,
-      platform::errors::InvalidArgument("The tensor should be initialized."));
-  PADDLE_ENFORCE_EQ(
-      platform::is_npu_place(tensor->place()), true,
-      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
-  // do async for better performance
-  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
-    Tensor tmp(tensor->type());
-    tmp.Resize(tensor->dims());
-    tmp.mutable_data<T>(place);
-
-    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
-                             GetCurrentNPUStream(tmp.device));
-    NpuOpRunner("Power", {tmp}, {*tensor},
-                {{"power", static_cast<float>(1)},
-                 {"scale", static_cast<float>(0)},
-                 {"shift", static_cast<float>(val)}});
-  } else {
-    T *array = new T[tensor->numel()];
-    for (unsigned int i = 0; i < tensor->numel(); ++i) {
-      array[i] = static_cast<T>(val);
-    }
-    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
-    // do sync copy
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
-                 tmp.data<void>(), platform::CPUPlace(), array, size, nullptr);
-    delete[] array;
-  }
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 154480a2fdcc5..5506ddd89692b 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -86,8 +86,43 @@ class NpuOpRunner {
 
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
 
+aclrtStream GetCurrentNPUStream(int device_id = -1);
+
 template <typename T>
-void FillNpuTensorWithConstant(Tensor *tensor, T val);
+void FillNpuTensorWithConstant(Tensor *tensor, T val) {
+  PADDLE_ENFORCE_EQ(
+      tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument("The tensor should be initialized."));
+  PADDLE_ENFORCE_EQ(
+      platform::is_npu_place(tensor->place()), true,
+      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
+  // do async for better performance
+  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
+    Tensor tmp(tensor->type());
+    tmp.Resize(tensor->dims());
+    tmp.mutable_data<T>(tensor->place());
+    auto stream = GetCurrentNPUStream(
+        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
+    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
+                             stream);
+    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
+                              {{"power", static_cast<float>(1)},
+                               {"scale", static_cast<float>(0)},
+                               {"shift", static_cast<float>(val)}});
+    runner.Run(stream);
+  } else {
+    T *array = new T[tensor->numel()];
+    for (unsigned int i = 0; i < tensor->numel(); ++i) {
+      array[i] = static_cast<T>(val);
+    }
+    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
+    // do sync copy
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
+                 tensor->data<void>(), platform::CPUPlace(), array,
+                 tensor->numel() * sizeof(T), nullptr);
+    delete[] array;
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle

From 82e0845f77bed066b17d930022c3e22661f588a2 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 31 Mar 2021 08:10:57 +0000
Subject: [PATCH 5/6] fix fill_constant

---
 paddle/fluid/operators/fill_constant_op_npu.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index eeacd09a9c5c5..4ea4c11c47835 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
 
     Tensor tensor_tmp(data_type);
     tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
-    std::vector<T> init = {value};
-    TensorFromVector(init, ctx.device_context(), &tensor_tmp);
+    FillNpuTensorWithConstant<T>(&tensor_tmp, value);
 
     out_var->mutable_data<T>(shape, place);
     auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},

From 7c410bedff1b6f69d8ac811082ef9265edc4f9b2 Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Thu, 1 Apr 2021 08:06:32 +0000
Subject: [PATCH 6/6] make TensorFromVector/TensorToVector sync

---
 paddle/fluid/framework/tensor_util.h | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 1df4cf2638c30..cab4744ed0dfb 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
+  // NOTE(zhiqiu): Be careful: aclrtMemcpyAsync is different from
+  // cudaMemcpyAsync.
+  // cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
+  // aclrtMemcpyAsync is really "async" between cpu <-> npu.
+  // Since the vector lives on the cpu, this function should be a "sync"
+  // operation, so pass nullptr as the stream to memory::Copy().
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 src_place, src_ptr, size, nullptr);
   }
 #endif
 }
@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 src_place, src_ptr, size, nullptr);
   }
 #endif
   delete[] array;
@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
+                 size, nullptr);
   }
 #endif
 }
@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
+                 size, nullptr);
   }
 #endif
   for (unsigned int i = 0; i < src.numel(); i++) {
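
For reference, the sketch below shows how the FillNpuTensorWithConstant<T> helper introduced in PATCH 3/6 and PATCH 4/6 is meant to be called from an operator kernel, following the fill_constant change in PATCH 5/6. It is illustrative only and not part of the patch series: the kernel name, the "Out"/"value" names, and the "FillD" attribute map are assumptions, not code from these patches.

// Hypothetical NPU kernel: materialize a scalar on the device with
// FillNpuTensorWithConstant, then broadcast it into the output via the ACL
// "FillD" op (names marked "assumed" are illustrative, not from the patches).
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace operators {

template <typename T>
class ExampleFillNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Output<framework::Tensor>("Out");    // assumed output name
    T value = static_cast<T>(ctx.Attr<float>("value"));  // assumed attribute

    // 1-element NPU tensor holding the constant (same pattern as PATCH 5/6).
    framework::Tensor tensor_tmp;
    tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
    FillNpuTensorWithConstant<T>(&tensor_tmp, value);

    // Broadcast the scalar to the output shape; out->dims() is assumed to
    // have been set by InferShape before the kernel runs.
    out->mutable_data<T>(ctx.GetPlace());
    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
    auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out},
                              {{"dims", framework::vectorize(out->dims())}});
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

Because FillNpuTensorWithConstant only queues device-side work, such a kernel would not need the explicit ctx.Wait() calls dropped in PATCH 2/6; when a synchronous copy is required, the nullptr-stream path added in PATCH 1/6 drains the NPU device context before issuing it.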