[NPU] Support npu save load #31893

Merged: 11 commits, Apr 9, 2021
51 changes: 45 additions & 6 deletions paddle/fluid/framework/tensor_util.cc
@@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
    PADDLE_THROW(platform::errors::Unimplemented(
        "XPUPlace is not supported when not compiled with XPU"));
#endif
  } else if (platform::is_npu_place(tensor.place())) {
#ifdef PADDLE_WITH_ASCEND_CL
    constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
    std::unique_ptr<char[]> buf(new char[kBufSize]);
    auto& npu_dev_ctx =
        static_cast<const platform::NPUDeviceContext&>(dev_ctx);
    platform::CPUPlace cpu;
    uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
    while (size != 0) {
      size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
      memory::Copy(cpu, buf.get(),
                   BOOST_GET_CONST(platform::NPUPlace, tensor.place()),
                   reinterpret_cast<const void*>(data), size_to_write,
                   npu_dev_ctx.stream());
      npu_dev_ctx.Wait();
      os.write(buf.get(), size_to_write);
      data += size_to_write;
      size -= size_to_write;
    }
#else
    PADDLE_THROW(platform::errors::Unimplemented(
        "NPUPlace is not supported when not compiled with NPU"));
#endif
  } else {
    os.write(static_cast<const char*>(data_ptr),
@@ -877,8 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
  auto ctx = platform::CPUDeviceContext();
  size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
  if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
      platform::is_xpu_place(dev_ctx.GetPlace()) ||
      platform::is_npu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU || \
    defined PADDLE_WITH_ASCEND_CL
    Tensor cpu_tensor;
    cpu_tensor.Resize(framework::make_ddim(shape));
    framework::VisitDataType(
@@ -887,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
    is.read(static_cast<char*>(buf), size);
    auto dst_place = dev_ctx.GetPlace();
    framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
    if (platform::is_npu_place(dev_ctx.GetPlace())) {
      dev_ctx.Wait();
    }
#else
    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPlace is not supported when not compiled with CUDA"));
    } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
      PADDLE_THROW(platform::errors::Unimplemented(
          "XPUPlace is not supported when not compiled with XPU"));
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported when not compiled with NPU"));
    }
#endif
  } else {
@@ -934,8 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
  auto ctx = platform::CPUDeviceContext();
  size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
  if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
      platform::is_xpu_place(dev_ctx.GetPlace()) ||
      platform::is_npu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU || \
    defined PADDLE_WITH_ASCEND_CL
    Tensor cpu_tensor;
    cpu_tensor.Resize(framework::make_ddim(dims));
    framework::VisitDataType(
@@ -944,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
    is.read(static_cast<char*>(buf), size);
    auto dst_place = dev_ctx.GetPlace();
    framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
    if (platform::is_npu_place(dev_ctx.GetPlace())) {
      dev_ctx.Wait();
    }
#else
    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPlace is not supported when not compiled with CUDA"));
    } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
      PADDLE_THROW(platform::errors::Unimplemented(
          "XPUPlace is not supported when not compiled with XPU"));
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported when not compiled with NPU"));
    }
#endif
  } else {
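The TensorToStream hunk above serializes an NPU tensor through a fixed 64MB host staging buffer, waiting on the stream after each asynchronous device-to-host copy so the buffer contents are valid before os.write. A minimal self-contained sketch of that chunking pattern, with device_to_host as a hypothetical stand-in for memory::Copy plus npu_dev_ctx.Wait():

#include <algorithm>
#include <cstring>
#include <memory>
#include <ostream>

// Stand-in for memory::Copy(cpu, dst, npu_place, src, n, stream); the real
// NPU copy is asynchronous on a stream, so each chunk is followed by a Wait().
static void device_to_host(void* dst, const void* src, size_t n) {
  std::memcpy(dst, src, n);
}

// Write `size` bytes of (simulated) device memory to `os` in bounded chunks,
// so serialization never needs more than kBufSize of host memory at once.
void stream_out_chunked(std::ostream& os, const char* device_data,
                        size_t size) {
  constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB, as in the PR
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    device_to_host(buf.get(), device_data, size_to_write);
    // npu_dev_ctx.Wait() happens here in the real code
    os.write(buf.get(), size_to_write);
    device_data += size_to_write;
    size -= size_to_write;
  }
}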
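TensorFromStream goes the other way: raw bytes are read into a CPU staging tensor, TensorCopy moves them to the device, and the new dev_ctx.Wait() matters because the NPU copy is asynchronous while the staging tensor is a local that dies at scope exit. A sketch of that ordering, with hypothetical host_to_device and wait_for_device stand-ins:

#include <cstring>
#include <istream>
#include <vector>

// Hypothetical stand-ins: host_to_device plays framework::TensorCopy's
// device path, wait_for_device plays dev_ctx.Wait(). On a real NPU the
// copy below is queued asynchronously on a stream.
static void host_to_device(void* dst, const void* src, size_t n) {
  std::memcpy(dst, src, n);
}
static void wait_for_device() { /* blocks until queued copies complete */ }

void load_to_device(std::istream& is, void* device_dst, size_t size) {
  std::vector<char> cpu_buf(size);  // plays the role of cpu_tensor
  is.read(cpu_buf.data(), size);    // deserialize raw bytes into host memory
  host_to_device(device_dst, cpu_buf.data(), size);
  wait_for_device();  // must finish before cpu_buf is destroyed at scope exit
}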
27 changes: 27 additions & 0 deletions paddle/fluid/operators/load_combine_op_npu.cc
@@ -0,0 +1,27 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/load_combine_op.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    load_combine,
    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
#endif
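Each of the new *_op_npu.cc files is this same registration boilerplate: the existing device-generic kernel template is instantiated once per supported dtype against NPUDeviceContext. A toy, self-contained illustration of the idea; this is not Paddle's actual macro expansion, and npu_load_combine_registry and the other names are invented for the sketch:

#include <cstdint>
#include <functional>
#include <map>
#include <typeindex>
#include <typeinfo>

struct NPUDeviceContext {};  // stand-in for paddle::platform::NPUDeviceContext

// One templated kernel body shared by every device/dtype combination,
// mirroring ops::LoadCombineOpKernel<DeviceContext, T>.
template <typename DeviceContext, typename T>
void LoadCombineKernel(const DeviceContext& /*ctx*/) {
  // a real kernel would deserialize T-typed tensors here
}

// Registry keyed by dtype, standing in for what REGISTER_OP_NPU_KERNEL
// conceptually records for the `load_combine` op.
std::map<std::type_index, std::function<void(const NPUDeviceContext&)>>
    npu_load_combine_registry;

template <typename T>
void RegisterLoadCombineNPU() {
  npu_load_combine_registry[typeid(T)] =
      &LoadCombineKernel<NPUDeviceContext, T>;
}

int main() {
  // same dtype list as load_combine_op_npu.cc above
  RegisterLoadCombineNPU<float>();
  RegisterLoadCombineNPU<double>();
  RegisterLoadCombineNPU<int>();
  RegisterLoadCombineNPU<int8_t>();
  RegisterLoadCombineNPU<int64_t>();
  return 0;
}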
26 changes: 26 additions & 0 deletions paddle/fluid/operators/load_op_npu.cc
@@ -0,0 +1,26 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
[review] Contributor: 2016->2021
[review] Author: done

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
[review] Contributor: Can be removed
[review] Author: done

#include "paddle/fluid/operators/load_op.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    load, ops::LoadOpKernel<paddle::platform::NPUDeviceContext, float>,
    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, double>,
    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int>,
    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
#endif
26 changes: 26 additions & 0 deletions paddle/fluid/operators/save_combine_op_npu.cc
@@ -0,0 +1,26 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/save_combine_op.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    save_combine,
    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
#endif
30 changes: 30 additions & 0 deletions paddle/fluid/operators/save_op_npu.cc
@@ -0,0 +1,30 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/save_op.h"
#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    save, ops::SaveOpKernel<paddle::platform::NPUDeviceContext, float>,
    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, double>,
    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int>,
    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, uint8_t>,
    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int64_t>,
    ops::SaveOpKernel<paddle::platform::NPUDeviceContext,
                      paddle::platform::float16>);
#endif
31 changes: 30 additions & 1 deletion paddle/fluid/pybind/tensor_py.h
@@ -644,6 +644,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
  }
  bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
  bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
  bool is_npu_tensor = platform::is_npu_place(tensor.place());
  const auto &tensor_dims = tensor.dims();
  auto tensor_dtype = tensor.type();
  size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
@@ -662,7 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,

  std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());

  if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) {
    if (!need_deep_copy) {
      auto base = py::cast(std::move(tensor));
      return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
@@ -729,6 +730,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use CUDAPlace in CPU only version, "
        "Please recompile or reinstall Paddle with CUDA support."));
#endif
  } else if (is_npu_tensor) {
#ifdef PADDLE_WITH_ASCEND_CL
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
                      platform::errors::InvalidArgument(
                          "PyArray is not writable, in which case memory leak "
                          "or double free would occur"));
    PADDLE_ENFORCE_EQ(
        py_arr.owndata(), true,
        platform::errors::InvalidArgument(
            "PyArray does not own data, in which case memory leak "
            "or double free would occur"));

    size_t copy_bytes = sizeof_dtype * numel;
    auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place());
    platform::DeviceContextPool &pool =
        platform::DeviceContextPool::Instance();
    auto &ctx = *pool.Get(tensor.place());
    paddle::memory::Copy(
        platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr,
        copy_bytes,
        reinterpret_cast<const platform::NPUDeviceContext &>(ctx).stream());
    ctx.Wait();
    return py_arr;
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use NPUPlace in CPU/GPU/XPU version, "
        "Please recompile or reinstall Paddle with NPU support."));
#endif
  }
  PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
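In tensor_py.h the NPU branch allocates the destination py::array first, enforces that it is writable and owns its buffer (otherwise the copy could leak or double-free), then copies device memory straight into the array's storage and waits on the device context before returning. A minimal pybind11 sketch of the same flow, with fake_device_to_host standing in for the asynchronous paddle::memory::Copy plus ctx.Wait():

#include <cstring>
#include <stdexcept>
#include <vector>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>

namespace py = pybind11;

// Stand-in for the async paddle::memory::Copy on the NPU stream followed
// by ctx.Wait().
static void fake_device_to_host(void* dst, const void* src, size_t bytes) {
  std::memcpy(dst, src, bytes);
}

// Copy a (simulated) device buffer into a freshly allocated NumPy array
// that the Python side will own.
py::array device_to_numpy(const float* device_buf,
                          const std::vector<py::ssize_t>& dims) {
  py::array py_arr(py::dtype("float32"), dims);
  // Mirror the PADDLE_ENFORCE_EQ checks: copying into a read-only or
  // non-owning array would risk a leak or double free.
  if (!py_arr.writeable() || !py_arr.owndata()) {
    throw std::runtime_error("expected a writable, owning array");
  }
  fake_device_to_host(py_arr.mutable_data(), device_buf, py_arr.nbytes());
  // a real NPU path waits on the device context here before returning
  return py_arr;
}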
12 changes: 10 additions & 2 deletions python/paddle/fluid/io.py
@@ -1973,6 +1973,10 @@ def set_var(var, ndarray):
        p = paddle.fluid.core.Place()
        p.set_place(t._place())
        place = paddle.fluid.XPUPlace(p.xpu_device_id())
    elif p.is_npu_place():
        p = paddle.fluid.core.Place()
        p.set_place(t._place())
        place = paddle.fluid.NPUPlace(p.npu_device_id())
    else:
        p = paddle.fluid.core.Place()
        p.set_place(t._place())
@@ -2115,8 +2119,8 @@ def _load_vars_with_try_catch(exe,
        error_str = "Failed to load model/variables `%s`, please make sure " \
                    "model/variables file is saved with the following APIs: " \
                    "save_params, save_persistables, save_vars."
        filenames = [var.name for var in
                     vars] if filename is None else filename
        if raise_error:
            raise RuntimeError(error_str % filenames)
        else:
@@ -2256,6 +2260,10 @@ def set_program_state(program, state_dict):
                p = paddle.fluid.core.Place()
                p.set_place(ten_place)
                py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
            elif ten_place.is_npu_place():
                p = paddle.fluid.core.Place()
                p.set_place(ten_place)
                py_place = paddle.fluid.NPUPlace(p.npu_device_id())

            ten.set(new_para_np, py_place)