Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPU] support npu for memcpy op #31808

Merged
merged 4 commits into from
Mar 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions paddle/fluid/framework/op_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)

/**
 * Registers kernel functors for `op_type` on the NPU device.
 *
 * Thin wrapper over REGISTER_OP_KERNEL_EX that fixes the library tag to NPU,
 * the place type to ::paddle::platform::NPUPlace, the name suffix to
 * DEFAULT_TYPE and the customized type value to
 * OpKernelType::kDefaultCustomizedTypeValue. The variadic arguments are
 * forwarded unchanged to REGISTER_OP_KERNEL_EX.
 */
#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)

/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to
Expand Down
20 changes: 15 additions & 5 deletions paddle/fluid/operators/memcpy_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"is the same as input X.");
AddAttr<int>("dst_place_type",
"Determine the dst place of tensor copy. "
"By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other "
"place type is Unimplemented and will cause ERROR."
"By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or "
"NPUPlace <-> CPUPlace. "
"Other place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. "
"1: dst is on CUDAPlace. "
"2: dst is on CUDAPinnedPlace. "
"3: dst is on XPUPlace. ");
"3: dst is on XPUPlace. "
"4: dst is on NPUPlace. ");
AddComment(R"DOC(
Memcpy Operator.
By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace,
and used as an internal op by Recompute-Offload.
By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or
NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload.
You would have to update it if you want other more capacities.

Out = X, when type in [LoDTensor]
Expand Down Expand Up @@ -144,3 +146,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
ops::MemcpyKernel, plat::float16,
ops::MemcpyKernel);
#endif

#ifdef PADDLE_WITH_ASCEND_CL
// Only when PADDLE_WITH_ASCEND_CL is defined: register ops::MemcpyKernel as
// the NPU kernel functor of the `memcpy` op. The arguments are
// (data type, functor) pairs covering float, double, int, int64_t, bool and
// plat::float16.
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
ops::MemcpyKernel, int, ops::MemcpyKernel,
int64_t, ops::MemcpyKernel, bool,
ops::MemcpyKernel, plat::float16,
ops::MemcpyKernel);
#endif
12 changes: 11 additions & 1 deletion paddle/fluid/operators/memcpy_op.h
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,17 @@ class MemcpyFunctor {
} else if (dst_place_type_ == 2) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
} else {
}
#ifdef PADDLE_WITH_ASCEND_CL
else if (dst_place_type_ == 0) { // NOLINT
framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_,
&out_tensor);
} else if (dst_place_type_ == 4) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"memcpy dst_place_type: %d is not supported yet.", dst_place_type_));
}
Expand Down
104 changes: 104 additions & 0 deletions python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard

# Switch Paddle to static mode at import time; the tests below build and run
# explicit Program objects through an Executor.
paddle.enable_static()
# NOTE(review): SEED is not referenced by any test in this file — confirm
# whether it is still needed.
SEED = 2021


@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestMemcpy_FillConstant(unittest.TestCase):
    """Exercises the ``memcpy`` op between the ``tensor@Npu`` and
    ``tensor@Cpu`` variables.

    Every test builds a static program in which two [10, 10] float32
    variables are produced by ``fill_constant`` (ones for ``tensor@Npu``,
    zeros for ``tensor@Cpu``), appends a single ``memcpy`` op, runs the
    program on ``NPUPlace(0)`` and compares the fetched arrays.
    """

    # Shape shared by both variables and by the expected reference arrays.
    _SHAPE = [10, 10]

    def _append_fill_constant(self, block, var_name, dtype, value,
                              place_type):
        """Append a ``fill_constant`` op writing ``value`` into ``var_name``."""
        block.append_op(
            type="fill_constant",
            outputs={"Out": var_name},
            attrs={
                "shape": list(self._SHAPE),
                "dtype": dtype,
                "value": value,
                "place_type": place_type
            })

    def get_prog(self):
        """Build the base program shared by all tests.

        Returns:
            tuple: ``(main_program, npu_var, cpu_var)`` where ``npu_var`` is
            filled with 1.0 and ``cpu_var`` is filled with 0.0.
        """
        paddle.enable_static()
        main_program = Program()
        with program_guard(main_program):
            cpu_var_name = "tensor@Cpu"
            npu_var_name = "tensor@Npu"
            block = main_program.global_block()
            cpu_var = block.create_var(
                name=cpu_var_name,
                shape=list(self._SHAPE),
                dtype='float32',
                persistable=False,
                stop_gradient=True)
            npu_var = block.create_var(
                name=npu_var_name,
                shape=list(self._SHAPE),
                dtype='float32',
                persistable=False,
                stop_gradient=True)
            # NOTE(review): place_type 1 and 2 are kept exactly as in the
            # original test. If fill_constant numbers places the same way as
            # memcpy's dst_place_type, they denote CUDAPlace and
            # CUDAPinnedPlace rather than NPU and CPU — confirm these values
            # are intended.
            self._append_fill_constant(block, npu_var_name, npu_var.dtype,
                                       1.0, 1)
            self._append_fill_constant(block, cpu_var_name, cpu_var.dtype,
                                       0.0, 2)
        return main_program, npu_var, cpu_var

    def _run_memcpy(self, to_cpu):
        """Append one ``memcpy`` op to the base program and execute it.

        Args:
            to_cpu (bool): if True copy ``tensor@Npu`` into ``tensor@Cpu``
                with ``dst_place_type=0``; otherwise copy ``tensor@Cpu`` into
                ``tensor@Npu`` with ``dst_place_type=4``.

        Returns:
            tuple: the fetched ``(npu_array, cpu_array)`` after the run.
        """
        main_program, npu_var, cpu_var = self.get_prog()
        if to_cpu:
            src_var, dst_var, dst_place_type = npu_var, cpu_var, 0
        else:
            src_var, dst_var, dst_place_type = cpu_var, npu_var, 4
        main_program.global_block().append_op(
            type='memcpy',
            inputs={'X': src_var},
            outputs={'Out': dst_var},
            attrs={'dst_place_type': dst_place_type})
        exe = fluid.Executor(fluid.NPUPlace(0))
        npu_, cpu_ = exe.run(main_program,
                             feed={},
                             fetch_list=[npu_var.name, cpu_var.name])
        return npu_, cpu_

    # The "cpoy" spelling in the two method names below is kept on purpose so
    # that existing test ids remain stable.
    def test_npu_cpoy_to_cpu(self):
        """After NPU -> CPU memcpy both variables must hold all ones."""
        npu_, cpu_ = self._run_memcpy(to_cpu=True)
        # A memcpy must reproduce the data exactly; assert_array_equal also
        # reports the mismatching elements on failure, unlike assertTrue.
        np.testing.assert_array_equal(npu_, cpu_)
        np.testing.assert_array_equal(cpu_, np.ones(tuple(self._SHAPE)))

    def test_cpu_cpoy_npu(self):
        """After CPU -> NPU memcpy both variables must hold all zeros."""
        npu_, cpu_ = self._run_memcpy(to_cpu=False)
        np.testing.assert_array_equal(npu_, cpu_)
        np.testing.assert_array_equal(npu_, np.zeros(tuple(self._SHAPE)))


# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()