diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6975dd7a214da..b002d6a0019ba 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx("dst_place_type", "Determine the dst place of tensor copy. " - "By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other " - "place type is Unimplemented and will cause ERROR." + "By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or " + "NPUPlace <-> CPUPlace. " + "Other place type is Unimplemented and will cause ERROR." "0: dst is on CPUPlace. " "1: dst is on CUDAPlace. " "2: dst is on CUDAPinnedPlace. " - "3: dst is on XPUPlace. "); + "3: dst is on XPUPlace. " + "4: dst is on NPUPlace. "); AddComment(R"DOC( Memcpy Operator. - By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace, - and used as an internal op by Recompute-Offload. + By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or + NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload. You would have to update it if you want other more capacities. 
Out = X, when type in [LoDTensor] @@ -144,3 +146,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); #endif + +#ifdef PADDLE_WITH_ASCEND_CL +REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, + ops::MemcpyKernel, int, ops::MemcpyKernel, + int64_t, ops::MemcpyKernel, bool, + ops::MemcpyKernel, plat::float16, + ops::MemcpyKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h old mode 100755 new mode 100644 index ac190312653b7..a0b07d251ac47 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -44,7 +44,17 @@ class MemcpyFunctor { } else if (dst_place_type_ == 2) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); - } else { + } +#ifdef PADDLE_WITH_ASCEND_CL + else if (dst_place_type_ == 0) { // NOLINT + framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_, + &out_tensor); + } else if (dst_place_type_ == 4) { + framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, + &out_tensor); + } +#endif + else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } diff --git a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py new file mode 100755 index 0000000000000..63c4fb8e5885e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import compiler, Program, program_guard
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMemcpy_FillConstant(unittest.TestCase):
+    def get_prog(self):
+        paddle.enable_static()
+        main_program = Program()
+        with program_guard(main_program):
+            cpu_var_name = "tensor@Cpu"
+            npu_var_name = "tensor@Npu"
+            cpu_var = main_program.global_block().create_var(
+                name=cpu_var_name,
+                shape=[10, 10],
+                dtype='float32',
+                persistable=False,
+                stop_gradient=True)
+            npu_var = main_program.global_block().create_var(
+                name=npu_var_name,
+                shape=[10, 10],
+                dtype='float32',
+                persistable=False,
+                stop_gradient=True)
+            main_program.global_block().append_op(
+                type="fill_constant",
+                outputs={"Out": npu_var_name},
+                attrs={
+                    "shape": [10, 10],
+                    "dtype": npu_var.dtype,
+                    "value": 1.0,
+                    "place_type": 1
+                })
+            main_program.global_block().append_op(
+                type="fill_constant",
+                outputs={"Out": cpu_var_name},
+                attrs={
+                    "shape": [10, 10],
+                    "dtype": cpu_var.dtype,
+                    "value": 0.0,
+                    "place_type": 2
+                })
+        return main_program, npu_var, cpu_var
+
+    def test_npu_copy_to_cpu(self):
+        main_program, npu_var, cpu_var = self.get_prog()
+        main_program.global_block().append_op(
+            type='memcpy',
+            inputs={'X': npu_var},
+            outputs={'Out': cpu_var},
+            attrs={'dst_place_type': 0})
+        place = fluid.NPUPlace(0)
+        exe = fluid.Executor(place)
+        npu_, cpu_ = exe.run(main_program,
+                             feed={},
+                             fetch_list=[npu_var.name, cpu_var.name])
+        self.assertTrue(np.allclose(npu_, cpu_))
+        self.assertTrue(np.allclose(cpu_, np.ones((10, 10))))
+
+    def test_cpu_copy_npu(self):
+        main_program, npu_var, cpu_var = self.get_prog()
+        main_program.global_block().append_op(
+            type='memcpy',
+            inputs={'X': cpu_var},
+            outputs={'Out': npu_var},
+            attrs={'dst_place_type': 4})
+        place = fluid.NPUPlace(0)
+        exe = fluid.Executor(place)
+        npu_, cpu_ = exe.run(main_program,
+                             feed={},
+                             fetch_list=[npu_var.name, cpu_var.name])
+        self.assertTrue(np.allclose(npu_, cpu_))
+        self.assertTrue(np.allclose(npu_, np.zeros((10, 10))))
+
+
+if __name__ == '__main__':
+    unittest.main()