From ea098d9ec988e3465ddfdfee0ff49828a44f5d04 Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Wed, 30 Jun 2021 10:10:10 +0000
Subject: [PATCH 1/3] change ScatterAdd to EmbeddingDenseGrad in lookup_table
 NPU op

---
 paddle/fluid/operators/lookup_table_v2_op_npu.cc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index b4a861ed19c1b6..a3699e7bcfa6a1 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -65,17 +65,13 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
+    int num_words = table_grad_t->dims()[0];
     const auto &runner_zeros =
-        NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
+        NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t},
+                    {*table_grad_t}, {{"num_weights", num_words},
+                                      {"padding_idx", -1},
+                                      {"scale_grad_by_freq", false}});
     runner_zeros.Run(stream);
-
-    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-    // can be different tensor, but in cann 20.2+, it does inplace operation.
-    // Thus, the first input and output should be same tensor.
-    const auto &runner_scatter =
-        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
-                    {*table_grad_t}, {{"use_locking", true}});
-    runner_scatter.Run(stream);
   }
 };
 }  // namespace operators

From c82857be1a6256ba03e73bcf7ab13546a3705816 Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Wed, 7 Jul 2021 10:34:17 +0000
Subject: [PATCH 2/3] EmbeddingDenseGrad only supports dim multiple of 32

---
 .../fluid/operators/lookup_table_v2_op_npu.cc | 30 +++++++++++++----
 .../npu/test_lookup_table_v2_op_npu.py        | 32 +++++++++++++++++--
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index a3699e7bcfa6a1..f938281a4e6318 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -65,13 +65,29 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    int num_words = table_grad_t->dims()[0];
-    const auto &runner_zeros =
-        NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t},
-                    {*table_grad_t}, {{"num_weights", num_words},
-                                      {"padding_idx", -1},
-                                      {"scale_grad_by_freq", false}});
-    runner_zeros.Run(stream);
+    int embedding_dim = table_grad_t->dims()[-1];
+
+    if (embedding_dim % 32 == 0) {
+      int num_weights = table_grad_t->dims()[0];
+      const auto &runner =
+          NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t},
+                      {*table_grad_t}, {{"num_weights", num_weights},
+                                        {"padding_idx", -1},
+                                        {"scale_grad_by_freq", false}});
+      runner.Run(stream);
+    } else {
+      const auto &runner_zeros =
+          NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
+      runner_zeros.Run(stream);
+
+      // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
+      // can be different tensor, but in cann 20.2+, it does inplace operation.
+      // Thus, the first input and output should be same tensor.
+      const auto &runner_scatter =
+          NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                      {*table_grad_t}, {{"use_locking", true}});
+      runner_scatter.Run(stream);
+    }
   }
 };
 }  // namespace operators
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index 2463ddb7137acd..0f6ce09e17197a 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -35,14 +35,14 @@ def setUp(self):
         self.place = paddle.NPUPlace(0)
 
         self.init_dtype()
+        self.init_dim()
         np.random.seed(SEED)
         bsz = 6
         seqlen = 8
         vocab = 10
-        dim = 20
-        w = np.ones([vocab, dim]).astype(self.dtype)
+        w = np.ones([vocab, self.dim]).astype(self.dtype)
         x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32)
-        out = np.ones([bsz, seqlen, dim]).astype(self.dtype)
+        out = np.ones([bsz, seqlen, self.dim]).astype(self.dtype)
 
         self.inputs = {
             'W': OpTest.np_dtype_to_fluid_dtype(w),
@@ -62,6 +62,9 @@ def set_npu(self):
     def init_dtype(self):
         self.dtype = np.float32
 
+    def init_dim(self):
+        self.dim = 20
+
     def test_check_output(self):
         self.check_output_with_place(self.place, check_dygraph=False)
 
@@ -85,5 +88,28 @@ def set_npu(self):
         self.__class__.no_need_check_grad = True
 
 
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestLookupTableV2Dim32(TestLookupTableV2):
+    def init_dim(self):
+        self.dim = 32
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestLookupTableV2Dim32FP16(TestLookupTableV2):
+    no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def init_dim(self):
+        self.dim = 32
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.__class__.no_need_check_grad = True
+
+
 if __name__ == '__main__':
     unittest.main()

From 9fe4be0b01cf7b9cae187ab06894bc80dd7012ae Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Wed, 7 Jul 2021 11:41:27 +0000
Subject: [PATCH 3/3] fix shape error: read embedding_dim from dims()[1]
 instead of dims()[-1]

---
 paddle/fluid/operators/lookup_table_v2_op_npu.cc            | 4 +++-
 .../tests/unittests/npu/test_lookup_table_v2_op_npu.py      | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index f938281a4e6318..61b5ec6794736b 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -65,9 +65,11 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    int embedding_dim = table_grad_t->dims()[-1];
+    int embedding_dim = table_grad_t->dims()[1];
 
     if (embedding_dim % 32 == 0) {
+      // NOTE(pangyoki): The embedding_dim of Tensor used in
+      // EmbeddingDenseGrad must be an integer multiple of 32.
       int num_weights = table_grad_t->dims()[0];
       const auto &runner =
           NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t},
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index 0f6ce09e17197a..41fe0636bd7790 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -63,6 +63,7 @@ def init_dtype(self):
         self.dtype = np.float32
 
     def init_dim(self):
+        # embedding_dim is not a multiple of 32
         self.dim = 20
 
     def test_check_output(self):
@@ -92,7 +93,8 @@ def set_npu(self):
                  "core is not compiled with NPU")
 class TestLookupTableV2Dim32(TestLookupTableV2):
     def init_dim(self):
-        self.dim = 32
+        # embedding_dim is a multiple of 32
+        self.dim = 64
 
 
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
@@ -104,7 +106,7 @@ def init_dtype(self):
         self.dtype = np.float16
 
     def init_dim(self):
-        self.dim = 32
+        self.dim = 64
 
     def set_npu(self):
         self.__class__.use_npu = True