Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPU] enable async copy and add wait before sync operation #31956

Merged
merged 6 commits into from
Apr 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions paddle/fluid/framework/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src,
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
}
#endif
}
Expand Down Expand Up @@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
}
#endif
delete[] array;
Expand Down Expand Up @@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
}
#endif
}
Expand Down Expand Up @@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
Expand Down
29 changes: 17 additions & 12 deletions paddle/fluid/memory/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -209,19 +209,19 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,

platform::SetNPUDeviceId(dst_place.device);

// NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;

VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";

if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else {
// On NPU, issuing an async operation after a sync operation is ok, while a
// sync operation after an async one is not, since the async operation may
// not be done yet. So it is needed to wait before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
Expand All @@ -237,19 +237,16 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,

platform::SetNPUDeviceId(src_place.device);

// NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;

VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";

if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
Expand All @@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
Expand All @@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
Expand Down
6 changes: 0 additions & 6 deletions paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});

ctx.Wait();

auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
Expand All @@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
{{"Out", {"Out"}}}, attrs);

op->Run(*scope, place);
ctx.Wait();

std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
Expand Down Expand Up @@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});

ctx.Wait();

// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
Expand All @@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,

auto place = ctx.GetPlace();
op->Run(*scope, place);
ctx.Wait();

std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);
Expand Down
3 changes: 1 addition & 2 deletions paddle/fluid/operators/fill_constant_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {

Tensor tensor_tmp(data_type);
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
std::vector<T> init = {value};
TensorFromVector(init, ctx.device_context(), &tensor_tmp);
FillNpuTensorWithConstant<T>(&tensor_tmp, value);

out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
Expand Down
7 changes: 5 additions & 2 deletions paddle/fluid/operators/npu_op_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second;
}

aclrtStream GetCurrentNPUStream() {
int device_id = platform::GetCurrentNPUDeviceId();
aclrtStream GetCurrentNPUStream(int device_id) {
if (device_id == -1) {
device_id = platform::GetCurrentNPUDeviceId();
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
pool.Get(platform::NPUPlace(device_id)));
Expand Down Expand Up @@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret);
}

} // namespace operators
} // namespace paddle
38 changes: 38 additions & 0 deletions paddle/fluid/operators/npu_op_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,44 @@ class NpuOpRunner {

aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);

aclrtStream GetCurrentNPUStream(int device_id = -1);

template <typename T>
// Fills an already-allocated NPU tensor with a single constant value.
// For float / float16 the fill is done asynchronously on the tensor's NPU
// stream (memset to zero, then the "Power" op computes 0*x + val); for all
// other dtypes a synchronous host->device copy of a pre-filled host buffer
// is used. The tensor must be initialized and reside on an NPUPlace.
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
  PADDLE_ENFORCE_EQ(
      tensor->IsInitialized(), true,
      platform::errors::InvalidArgument("The tensor should be initialized."));
  PADDLE_ENFORCE_EQ(
      platform::is_npu_place(tensor->place()), true,
      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
  // do async for better performance
  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
    Tensor tmp(tensor->type());
    tmp.Resize(tensor->dims());
    tmp.mutable_data<T>(tensor->place());
    auto stream = GetCurrentNPUStream(
        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
    // Zero the scratch tensor, then shift every element by `val`:
    // out = (scale * in + shift) ^ power = (0 * 0 + val) ^ 1 = val.
    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
                             stream);
    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
                              {{"power", static_cast<float>(1)},
                               {"scale", static_cast<float>(0)},
                               {"shift", static_cast<float>(val)}});
    runner.Run(stream);
  } else {
    // Build the constant buffer on the host; std::vector owns the memory
    // (the previous raw new[]/delete[] plus an unused vector did the same
    // work twice and risked a leak if memory::Copy threw).
    std::vector<T> host_buf(tensor->numel(), static_cast<T>(val));
    // do sync copy (nullptr stream => synchronous memcpy)
    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
                 tensor->data<void>(), platform::CPUPlace(), host_buf.data(),
                 tensor->numel() * sizeof(T), nullptr);
  }
}

} // namespace operators
} // namespace paddle
#endif
1 change: 1 addition & 0 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ NPUDeviceContext::~NPUDeviceContext() {

// Blocks the calling thread until all work submitted to this context's NPU
// device has completed (device-wide synchronization via
// aclrtSynchronizeDevice), failing loudly on any ACL error.
void NPUDeviceContext::Wait() const {
  // Select this context's device before synchronizing; the guard restores
  // the previously-current device when it goes out of scope.
  NPUDeviceGuard guard(place_.device);
  VLOG(4) << "NPU context Wait";
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}

Expand Down