From ea098d9ec988e3465ddfdfee0ff49828a44f5d04 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 30 Jun 2021 10:10:10 +0000 Subject: [PATCH 1/3] change ScatterAdd to EmbeddingDenseGrad in lookup_table NPU op --- paddle/fluid/operators/lookup_table_v2_op_npu.cc | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index b4a861ed19c1b6..a3699e7bcfa6a1 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -65,17 +65,13 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); + int num_words = table_grad_t->dims()[0]; const auto &runner_zeros = - NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); + NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, + {*table_grad_t}, {{"num_weights", num_words}, + {"padding_idx", -1}, + {"scale_grad_by_freq", false}}); runner_zeros.Run(stream); - - // NOTE(zhiqiu): It seems in cann 20.1, the first input and output - // can be different tensor, but in cann 20.2+, it does inplace operation. - // Thus, the first input and output should be same tensor. - const auto &runner_scatter = - NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, - {*table_grad_t}, {{"use_locking", true}}); - runner_scatter.Run(stream); } }; } // namespace operators From c82857be1a6256ba03e73bcf7ab13546a3705816 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 7 Jul 2021 10:34:17 +0000 Subject: [PATCH 2/3] EmbeddingDenseGrad only supports dim 32 --- .../fluid/operators/lookup_table_v2_op_npu.cc | 30 +++++++++++++---- .../npu/test_lookup_table_v2_op_npu.py | 32 +++++++++++++++++-- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index a3699e7bcfa6a1..f938281a4e6318 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -65,13 +65,29 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - int num_words = table_grad_t->dims()[0]; - const auto &runner_zeros = - NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, - {*table_grad_t}, {{"num_weights", num_words}, - {"padding_idx", -1}, - {"scale_grad_by_freq", false}}); - runner_zeros.Run(stream); + int embedding_dim = table_grad_t->dims()[-1]; + + if (embedding_dim % 32 == 0) { + int num_weights = table_grad_t->dims()[0]; + const auto &runner = + NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, + {*table_grad_t}, {{"num_weights", num_weights}, + {"padding_idx", -1}, + {"scale_grad_by_freq", false}}); + runner.Run(stream); + } else { + const auto &runner_zeros = + NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); + runner_zeros.Run(stream); + + // NOTE(zhiqiu): It seems in cann 20.1, the first input and output + // can be different tensor, but in cann 20.2+, it does inplace operation. + // Thus, the first input and output should be same tensor. + const auto &runner_scatter = + NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, + {*table_grad_t}, {{"use_locking", true}}); + runner_scatter.Run(stream); + } } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 2463ddb7137acd..0f6ce09e17197a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -35,14 +35,14 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init_dtype() + self.init_dim() np.random.seed(SEED) bsz = 6 seqlen = 8 vocab = 10 - dim = 20 - w = np.ones([vocab, dim]).astype(self.dtype) + w = np.ones([vocab, self.dim]).astype(self.dtype) x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) - out = np.ones([bsz, seqlen, dim]).astype(self.dtype) + out = np.ones([bsz, seqlen, self.dim]).astype(self.dtype) self.inputs = { 'W': OpTest.np_dtype_to_fluid_dtype(w), @@ -62,6 +62,9 @@ def set_npu(self): def init_dtype(self): self.dtype = np.float32 + def init_dim(self): + self.dim = 20 + def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False) @@ -85,5 +88,28 @@ def set_npu(self): self.__class__.no_need_check_grad = True +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLookupTableV2Dim32(TestLookupTableV2): + def init_dim(self): + self.dim = 32 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLookupTableV2Dim32FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def init_dim(self): + self.dim = 32 + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + if __name__ == '__main__': unittest.main() From 9fe4be0b01cf7b9cae187ab06894bc80dd7012ae Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 7 Jul 2021 11:41:27 +0000 Subject: [PATCH 3/3] fix shape error --- paddle/fluid/operators/lookup_table_v2_op_npu.cc | 4 +++- .../tests/unittests/npu/test_lookup_table_v2_op_npu.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index f938281a4e6318..61b5ec6794736b 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -65,9 +65,11 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - int embedding_dim = table_grad_t->dims()[-1]; + int embedding_dim = table_grad_t->dims()[1]; if (embedding_dim % 32 == 0) { + // NOTE(pangyoki): The embedding_dim of Tensor used in + // EmbeddingDenseGrad must be an integer multiple of 32. int num_weights = table_grad_t->dims()[0]; const auto &runner = NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 0f6ce09e17197a..41fe0636bd7790 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -63,6 +63,7 @@ def init_dtype(self): self.dtype = np.float32 def init_dim(self): + # embedding_dim is not multiple of 32 self.dim = 20 def test_check_output(self): @@ -92,7 +93,8 @@ def set_npu(self): "core is not compiled with NPU") class TestLookupTableV2Dim32(TestLookupTableV2): def init_dim(self): - self.dim = 32 + # embedding_dim is multiple of 32 + self.dim = 64 @unittest.skipIf(not paddle.is_compiled_with_npu(), @@ -104,7 +106,7 @@ def init_dtype(self): self.dtype = np.float16 def init_dim(self): - self.dim = 32 + self.dim = 64 def set_npu(self): self.__class__.use_npu = True