diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 778957cda403c..276bfa7b3281b 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -64,7 +64,7 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-aclrtStream GetCurrentNPUStream(int device_id = -1) {
+aclrtStream GetCurrentNPUStream(int device_id) {
   if (device_id == -1) {
     device_id = platform::GetCurrentNPUDeviceId();
   }
@@ -302,38 +302,5 @@ void NpuOpRunner::Run(aclrtStream stream) {
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
 
-template <typename T>
-void FillNpuTensorWithConstant(Tensor *tensor, T val) {
-  PADDLE_ENFORCE_EQ(
-      tensor->IsInitialized(), true,
-      platform::errors::InvalidArgument("The tensor should be initialized."));
-  PADDLE_ENFORCE_EQ(
-      platform::is_npu_place(tensor->place()), true,
-      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
-  // do async for better performance
-  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
-    Tensor tmp(tensor->type());
-    tmp.Resize(tensor->dims());
-    tmp.mutable_data<T>(place);
-
-    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
-                             GetCurrentNPUStream(tmp.device));
-    NpuOpRunner("Power", {tmp}, {*tensor},
-                {{"power", static_cast<float>(1)},
-                 {"scale", static_cast<float>(0)},
-                 {"shift", static_cast<float>(val)}});
-  } else {
-    T *array = new T[tensor->numel()];
-    for (unsigned int i = 0; i < tensor->numel(); ++i) {
-      array[i] = static_cast<T>(val);
-    }
-    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
-    // do sync copy
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
-                 tmp.data<void>(), platform::CPUPlace(), array, size, nullptr);
-    delete[] array;
-  }
-}
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 154480a2fdcc5..5506ddd89692b 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -86,8 +86,43 @@ class NpuOpRunner {
 
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
 
+aclrtStream GetCurrentNPUStream(int device_id = -1);
+
 template <typename T>
-void FillNpuTensorWithConstant(Tensor *tensor, T val);
+void FillNpuTensorWithConstant(Tensor *tensor, T val) {
+  PADDLE_ENFORCE_EQ(
+      tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument("The tensor should be initialized."));
+  PADDLE_ENFORCE_EQ(
+      platform::is_npu_place(tensor->place()), true,
+      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
+  // do async for better performance
+  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
+    Tensor tmp(tensor->type());
+    tmp.Resize(tensor->dims());
+    tmp.mutable_data<T>(tensor->place());
+    auto stream = GetCurrentNPUStream(
+        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
+    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
+                             stream);
+    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
+                              {{"power", static_cast<float>(1)},
+                               {"scale", static_cast<float>(0)},
+                               {"shift", static_cast<float>(val)}});
+    runner.Run(stream);
+  } else {
+    T *array = new T[tensor->numel()];
+    for (unsigned int i = 0; i < tensor->numel(); ++i) {
+      array[i] = static_cast<T>(val);
+    }
+    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
+    // do sync copy
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
+                 tensor->data<void>(), platform::CPUPlace(), array,
+                 tensor->numel() * sizeof(T), nullptr);
+    delete[] array;
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle