From 2d91a9bd80fa4b58a1d16f1d60f1323c542b49f4 Mon Sep 17 00:00:00 2001
From: Difer <707065510@qq.com>
Date: Fri, 4 Aug 2023 20:41:56 +0800
Subject: [PATCH] replace embedding in fluid with 2.0 version (#55757)

* replace embedding

* replace sparse_embedding

* fix some bugs

* del embedding

* replace layers.embedding

* fix type error
---
 python/paddle/fluid/layers/nn.py              | 166 ------------------
 .../distribute_transpiler/__init__.py         |   2 +-
 .../fleet/parameter_server/pslib/__init__.py  |   5 +-
 test/auto_parallel/test_dist_embedding.py     |  25 ++-
 test/book/notest_understand_sentiment.py      |   2 +-
 test/book/test_recommender_system.py          |  20 +--
 test/book/test_word2vec_book.py               |   8 +-
 .../distributed/test_dist_pod128_sample.py    |   2 +-
 test/ipu/distributed/test_dist_sample.py      |   2 +-
 test/ipu/test_lookuptable_op_ipu.py           |   2 +-
 test/ipu/test_weight_sharing_ipu.py           |   2 +-
 ...r_embedding_eltwise_layernorm_fuse_pass.py |  54 +++---
 test/legacy_test/dist_ctr.py                  |   4 +-
 test/legacy_test/dist_fleet_ctr.py            |   8 +-
 .../dist_fleet_heter_pipeline_ctr.py          |   4 +-
 test/legacy_test/dist_text_classification.py  |   2 +-
 test/legacy_test/dist_word2vec.py             |   8 +-
 test/legacy_test/fleet_heter_ps_training.py   |   4 +-
 test/legacy_test/nets.py                      |   2 +-
 test/legacy_test/simple_nets.py               |   2 +-
 test/legacy_test/test_communicator_geo.py     |   4 +-
 ..._dist_fleet_a_sync_optimizer_auto_async.py |   2 +-
 ...st_dist_fleet_a_sync_optimizer_auto_geo.py |   2 +-
 .../test_dist_fleet_heter_program.py          |   3 +-
 test/legacy_test/test_dist_fleet_ps.py        |   6 +-
 test/legacy_test/test_dist_fleet_ps3.py       |   6 +-
 test/legacy_test/test_dist_fleet_ps5.py       |   6 +-
 test/legacy_test/test_dist_sparse_load_ps0.py |   2 +-
 .../test_dist_sparse_tensor_load_sgd.py       |   2 +-
 test/legacy_test/test_dist_transpiler.py      |  14 +-
 test/legacy_test/test_downpoursgd.py          |   6 +-
 .../test_eager_deletion_padding_rnn.py        |   3 +-
 test/legacy_test/test_entry_attr2.py          |   2 +-
 test/legacy_test/test_fleet.py                |   2 +-
 test/legacy_test/test_fleet_base_2.py         |   4 +-
 test/legacy_test/test_fleet_nocvm_1.py        |   2 +-
 test/legacy_test/test_fleet_unitaccessor.py   |   2 +-
 test/legacy_test/test_gradient_clip.py        |   2 +-
 test/legacy_test/test_hsigmoid_op.py          |   2 +-
 test/legacy_test/test_layers.py               |  16 +-
 test/legacy_test/test_lookup_table_bf16_op.py |   4 +-
 test/legacy_test/test_lookup_table_op.py      |   9 +-
 test/legacy_test/test_monitor.py              |   4 +-
 test/legacy_test/test_regularizer.py          |   2 +-
 test/legacy_test/test_regularizer_api.py      |   2 +-
 test/legacy_test/test_sgd_op_bf16.py          |   2 +-
 test/legacy_test/test_weight_decay.py         |   2 +-
 test/legacy_test/transformer_model.py         |   5 +-
 48 files changed, 141 insertions(+), 301 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 067e5a55c9c75..a4a770a97829a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -57,176 +57,10 @@
 __all__ = [
-    'embedding',
     'autoincreased_step_counter',
 ]
-@deprecated(since="2.0.0", update_to="paddle.nn.functional.embedding")
-def embedding(
-    input,
-    size,
-    is_sparse=False,
-    is_distributed=False,
-    padding_idx=None,
-    param_attr=None,
-    dtype='float32',
-):
-    r"""
-    :api_attr: Static Graph
-
-    **WARNING:** This OP will be deprecated in a future release. This OP requires the
-    last dimension of Tensor shape must be equal to 1. It is recommended to use
-    fluid. :ref:`api_fluid_embedding` .
-
-    The operator is used to lookup embeddings vector of ids provided by :attr:`input` .
-    It automatically constructs a 2D embedding matrix based on the
-    input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` .
- - This OP requires the last dimension of Tensor shape must be equal to 1. The shape - of output Tensor is generated by replacing the last dimension of the input Tensor shape - with emb_size. - - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , - otherwise the program will throw an exception and exit. - - .. code-block:: text - - Case 1: - - input is a Tensor. padding_idx = -1 - input.data = [[[1], [3]], [[2], [4]], [[4], [127]]] - input.shape = [3, 2, 1] - Given size = [128, 16] - output is a Tensor: - out.shape = [3, 2, 16] - out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], - [0.345421456, 0.524563927, ..., 0.144534654]], - - [[0.345249859, 0.124939536, ..., 0.194353745], - [0.945345345, 0.435394634, ..., 0.435345365]], - - [[0.945345345, 0.435394634, ..., 0.435345365], - [0.0, 0.0, ..., 0.0 ]]] # padding data - The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 - It will pad all-zero data when ids is 127. - - Case 2: - - input is a LoDTensor with 1-level LoD. padding_idx = 0 - input.lod = [[2, 3]] - input.data = [[1], [3], [2], [4], [0]] - input.shape = [5, 1] - Given size = [128, 16] - output is a LoDTensor: - out.lod = [[2, 3]] - out.shape = [5, 16] - out.data = [[0.129435295, 0.244512452, ..., 0.436322452], - [0.345421456, 0.524563927, ..., 0.144534654], - [0.345249859, 0.124939536, ..., 0.194353745], - [0.945345345, 0.435394634, ..., 0.435345365], - [0.0, 0.0, ..., 0.0 ]] # padding data - It will pad all-zero data when ids is 0. - - Args: - input(Variable): A Tensor or LoDTensor with type int64, which contains the id information. - The last dimension of Tensor shape must be equal to 1. The value of the input id should - satisfy :math:`0<= id < size[0]` . - size(tuple|list): The shape of lookup table parameter. It should have two elements which - indicates the size of the dictionary of embeddings and the size of each embedding vector respectively. - is_sparse(bool): The flag indicating whether to use sparse update. This parameter only - affects the performance of the backwards gradient update. It is recommended to set - True because sparse update is faster. But some optimizer does not support sparse update, - such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` , - :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` , - :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` . - In these case, is_sparse must be False. Default: False. - is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used - in multi-machine distributed CPU training. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). - If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted - to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup - encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. - If set None, it makes no effect to output. Default: None. - param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, - user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. 
- The local word vector needs to be transformed into numpy format, and the shape of local word - vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` - is used to load custom or pre-trained word vectors. See code example 2 for details. - dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor. - It must be float32 or float64. Default: float32. - - Returns: - Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - paddle.enable_static() - - data = paddle.static.data(name='x', shape=[None, 1], dtype='int64') - - # example 1 - emb_1 = paddle.static.nn.embedding(input=data, size=[128, 64]) - - # example 2: load custom or pre-trained word vectors - weight_data = np.random.random(size=(128, 100)) # word vectors with numpy format - w_param_attrs = fluid.ParamAttr( - name="emb_weight", - learning_rate=0.5, - initializer=paddle.nn.initializer.Assign(weight_data), - trainable=True) - emb_2 = fluid.layers.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32') - """ - - helper = LayerHelper('embedding', **locals()) - check_variable_and_dtype( - input, 'input', ['int64'], 'fluid.layers.embedding' - ) - check_dtype( - dtype, - 'dtype', - ['uint16', 'float16', 'float32', 'float64'], - 'fluid.layers.embedding', - ) - - if is_distributed: - is_distributed = False - warnings.warn( - "is_distributed is go out of use, `paddle.static.nn.sparse_embedding` is your needed" - ) - - remote_prefetch = True if is_sparse else False - - w = helper.create_parameter( - attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False - ) - tmp = helper.create_variable_for_type_inference(dtype) - padding_idx = ( - -1 - if padding_idx is None - else padding_idx - if padding_idx >= 0 - else (size[0] + padding_idx) - ) - helper.append_op( - type='lookup_table', - inputs={'Ids': input, 'W': w}, - outputs={'Out': tmp}, - attrs={ - 'is_sparse': is_sparse, - 'is_distributed': is_distributed, - 'remote_prefetch': remote_prefetch, - 'padding_idx': padding_idx, - }, - ) - return tmp - - def autoincreased_step_counter(counter_name=None, begin=1, step=1): """ :api_attr: Static Graph diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py index 8d3fb51cc794a..800950e78f219 100644 --- a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py @@ -157,7 +157,7 @@ def get_sparse_attrs(): if len(dist_varnames) != 0: raise ValueError( - "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding" + "GeoStrategy can not support large scale embeding now, please use paddle.static.nn.embedding" ) init_attrs = [] diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py index d4e1c77ae96f4..d8b61aadb5c0c 100644 --- a/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py @@ -1124,7 +1124,7 @@ class fleet_embedding: Example: .. 
code-block:: python with fleet_embedding(click_name=label.name): - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=var, size=[-1, 11], is_sparse=True, @@ -1134,7 +1134,6 @@ class fleet_embedding: def __init__(self, click_name, scale_sparse_grad=True): """Init.""" - # self.origin_emb = fluid.layers.embedding self.origin_emb_v2 = paddle.static.nn.embedding # if user uses cvm layer after embedding, click_name can be None self.click_name = "" if click_name is None else click_name @@ -1144,7 +1143,6 @@ def __init__(self, click_name, scale_sparse_grad=True): def __enter__(self): """Enter.""" - # fluid.layers.embedding = _fleet_embedding paddle.static.nn.embedding = _fleet_embedding_v2 FLEET_GLOBAL_DICT["cur_accessor"] = self.accessor FLEET_GLOBAL_DICT["click_name"] = self.click_name @@ -1152,7 +1150,6 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): """Exit.""" - # fluid.layers.embedding = self.origin_emb paddle.static.nn.embedding = self.origin_emb_v2 FLEET_GLOBAL_DICT["cur_accessor"] = "" FLEET_GLOBAL_DICT["click_name"] = "" diff --git a/test/auto_parallel/test_dist_embedding.py b/test/auto_parallel/test_dist_embedding.py index 925b5c4bdee5a..bdfdc0ef32a78 100644 --- a/test/auto_parallel/test_dist_embedding.py +++ b/test/auto_parallel/test_dist_embedding.py @@ -31,13 +31,26 @@ def make_program_lookup_table_v1_mp_dp(): name='src_ids', shape=[12, 512, 1], dtype='int64' ) src_ids.stop_gradient = True - emb_out = paddle.fluid.layers.embedding( - input=src_ids, - size=[64, 128], - param_attr=paddle.fluid.ParamAttr(name="emb_weight"), - dtype="float32", - is_sparse=False, + + emb_out = block.create_var(name='emb_out', dtype='float32') + w = paddle.create_parameter( + attr=paddle.fluid.ParamAttr(name="emb_weight"), + shape=[64, 128], + dtype='float32', + is_bias=False, + ) + block.append_op( + type='lookup_table', + outputs={'Out': emb_out}, + inputs={'Ids': src_ids, 'W': w}, + attrs={ + 'is_sparse': False, + 'is_distributed': False, + 'remote_prefetch': False, + 'padding_idx': None, + }, ) + loss = paddle.mean(emb_out) auto.shard_tensor( diff --git a/test/book/notest_understand_sentiment.py b/test/book/notest_understand_sentiment.py index eb4e01ae2949a..8d7cde66bce56 100644 --- a/test/book/notest_understand_sentiment.py +++ b/test/book/notest_understand_sentiment.py @@ -31,7 +31,7 @@ def convolution_net( data, label, input_dim, class_dim=2, emb_dim=32, hid_dim=32 ): - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=data, size=[input_dim, emb_dim], is_sparse=True ) conv_3 = nets.sequence_conv_pool( diff --git a/test/book/test_recommender_system.py b/test/book/test_recommender_system.py index 47cfb52c738a9..f6605a13149d7 100644 --- a/test/book/test_recommender_system.py +++ b/test/book/test_recommender_system.py @@ -25,7 +25,7 @@ import paddle from paddle import fluid -from paddle.fluid import framework, layers +from paddle.fluid import framework from paddle.fluid.executor import Executor from paddle.fluid.optimizer import SGDOptimizer @@ -44,7 +44,7 @@ def get_usr_combined_features(): uid = paddle.static.data(name='user_id', shape=[-1, 1], dtype='int64') - usr_emb = layers.embedding( + usr_emb = paddle.static.nn.embedding( input=uid, dtype='float32', size=[USR_DICT_SIZE, 32], @@ -60,7 +60,7 @@ def get_usr_combined_features(): name='gender_id', shape=[-1, 1], dtype='int64' ) - usr_gender_emb = layers.embedding( + usr_gender_emb = paddle.static.nn.embedding( input=usr_gender_id, size=[USR_GENDER_DICT_SIZE, 16], 
param_attr='gender_table', @@ -72,7 +72,7 @@ def get_usr_combined_features(): USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) usr_age_id = paddle.static.data(name='age_id', shape=[-1, 1], dtype="int64") - usr_age_emb = layers.embedding( + usr_age_emb = paddle.static.nn.embedding( input=usr_age_id, size=[USR_AGE_DICT_SIZE, 16], is_sparse=IS_SPARSE, @@ -84,7 +84,7 @@ def get_usr_combined_features(): USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 usr_job_id = paddle.static.data(name='job_id', shape=[-1, 1], dtype="int64") - usr_job_emb = layers.embedding( + usr_job_emb = paddle.static.nn.embedding( input=usr_job_id, size=[USR_JOB_DICT_SIZE, 16], param_attr='job_table', @@ -109,7 +109,7 @@ def get_mov_combined_features(): mov_id = paddle.static.data(name='movie_id', shape=[-1, 1], dtype='int64') - mov_emb = layers.embedding( + mov_emb = paddle.static.nn.embedding( input=mov_id, dtype='float32', size=[MOV_DICT_SIZE, 32], @@ -125,12 +125,12 @@ def get_mov_combined_features(): name='category_id', shape=[-1, 1], dtype='int64', lod_level=1 ) - mov_categories_emb = layers.embedding( + mov_categories_emb = paddle.static.nn.embedding( input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE ) mov_categories_hidden = paddle.static.nn.sequence_lod.sequence_pool( - input=mov_categories_emb, pool_type="sum" + input=mov_categories_emb.squeeze(-2), pool_type="sum" ) MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) @@ -139,12 +139,12 @@ def get_mov_combined_features(): name='movie_title', shape=[-1, 1], dtype='int64', lod_level=1 ) - mov_title_emb = layers.embedding( + mov_title_emb = paddle.static.nn.embedding( input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE ) mov_title_conv = nets.sequence_conv_pool( - input=mov_title_emb, + input=mov_title_emb.squeeze(-2), num_filters=32, filter_size=3, act="tanh", diff --git a/test/book/test_word2vec_book.py b/test/book/test_word2vec_book.py index cdebfc58cfa9c..0c59f005a2287 100644 --- a/test/book/test_word2vec_book.py +++ b/test/book/test_word2vec_book.py @@ -58,28 +58,28 @@ def train( IS_SPARSE = is_sparse def __network__(words): - embed_first = fluid.layers.embedding( + embed_first = paddle.static.nn.embedding( input=words[0], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr='shared_w', ) - embed_second = fluid.layers.embedding( + embed_second = paddle.static.nn.embedding( input=words[1], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr='shared_w', ) - embed_third = fluid.layers.embedding( + embed_third = paddle.static.nn.embedding( input=words[2], size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, param_attr='shared_w', ) - embed_forth = fluid.layers.embedding( + embed_forth = paddle.static.nn.embedding( input=words[3], size=[dict_size, EMBED_SIZE], dtype='float32', diff --git a/test/ipu/distributed/test_dist_pod128_sample.py b/test/ipu/distributed/test_dist_pod128_sample.py index 40a081a356ce0..9b0a33dfd87fb 100644 --- a/test/ipu/distributed/test_dist_pod128_sample.py +++ b/test/ipu/distributed/test_dist_pod128_sample.py @@ -59,7 +59,7 @@ def TestDistTraining(): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') with paddle.static.ipu_shard_guard(index=0, stage=0): - out = paddle.fluid.layers.embedding(x, **attrs) + out = paddle.static.nn.embedding(x, **attrs) with paddle.static.ipu_shard_guard(index=1, stage=1): loss = 
paddle.mean(out) opt = paddle.optimizer.Adam(learning_rate=1e-1) diff --git a/test/ipu/distributed/test_dist_sample.py b/test/ipu/distributed/test_dist_sample.py index 1300b2807eabe..a5506db7e349f 100644 --- a/test/ipu/distributed/test_dist_sample.py +++ b/test/ipu/distributed/test_dist_sample.py @@ -77,7 +77,7 @@ def Test(use_dist, file_name): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') - out = paddle.fluid.layers.embedding(x, **attrs) + out = paddle.static.nn.embedding(x, **attrs) loss = paddle.mean(out) opt = paddle.optimizer.Adam(learning_rate=1e-1) opt.minimize(loss) diff --git a/test/ipu/test_lookuptable_op_ipu.py b/test/ipu/test_lookuptable_op_ipu.py index e0e2b7ae59877..cf93159fbb1ec 100644 --- a/test/ipu/test_lookuptable_op_ipu.py +++ b/test/ipu/test_lookuptable_op_ipu.py @@ -53,7 +53,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64' ) - out = paddle.fluid.layers.embedding(x, **self.attrs) + out = paddle.static.nn.embedding(x, **self.attrs) if self.is_training: loss = paddle.mean(out) adam = paddle.optimizer.Adam(learning_rate=1e-2) diff --git a/test/ipu/test_weight_sharing_ipu.py b/test/ipu/test_weight_sharing_ipu.py index 75ac2f5783199..9f114fec99ab6 100644 --- a/test/ipu/test_weight_sharing_ipu.py +++ b/test/ipu/test_weight_sharing_ipu.py @@ -55,7 +55,7 @@ def build_model(self): name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64' ) with paddle.static.ipu_shard_guard(index=0, stage=0): - y = paddle.fluid.layers.embedding( + y = paddle.static.nn.embedding( input=x, size=[768, 768], dtype='float32', diff --git a/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py b/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py index 260f0a8913e9d..dbdcdffdf5be1 100644 --- a/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py +++ b/test/ir/test_ir_embedding_eltwise_layernorm_fuse_pass.py @@ -27,26 +27,26 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): word_id = paddle.static.data( name="word_id", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) pos_id = paddle.static.data( name="pos_id", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) sent_id = paddle.static.data( name="sent_id", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) - word_emb = fluid.layers.embedding( + word_emb = paddle.static.nn.embedding( input=word_id, size=(128, 768), dtype='float32' ) - pos_emb = fluid.layers.embedding( + pos_emb = paddle.static.nn.embedding( input=pos_id, size=(128, 768), dtype='float32' ) - sent_emb = fluid.layers.embedding( + sent_emb = paddle.static.nn.embedding( input=sent_id, size=(128, 768), dtype='float32' ) add1 = paddle.add(word_emb, pos_emb) @@ -55,34 +55,34 @@ def setUp(self): id1 = paddle.static.data( name="id1", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) id2 = paddle.static.data( name="id2", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) id3 = paddle.static.data( name="id3", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) id4 = paddle.static.data( name="id4", - shape=[1, 128, 1], + shape=[1, 128], dtype="int64", ) - emb1 = fluid.layers.embedding( + emb1 = paddle.static.nn.embedding( input=id1, size=(128, 768), dtype='float32' ) - emb2 = fluid.layers.embedding( + emb2 = paddle.static.nn.embedding( input=id2, size=(128, 768), dtype='float32' ) - emb3 = fluid.layers.embedding( + emb3 = paddle.static.nn.embedding( input=id3, size=(128, 
768), dtype='float32'
             )
-            emb4 = fluid.layers.embedding(
+            emb4 = paddle.static.nn.embedding(
                 input=id4, size=(128, 768), dtype='float32'
             )
             add_1 = paddle.add(emb1, emb2)
@@ -93,25 +93,25 @@ def setUp(self):
             )
         self.feeds = {
-            "word_id": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)
-            ).astype("int64"),
-            "pos_id": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)
-            ).astype("int64"),
-            "sent_id": np.random.randint(
-                low=0, high=128, size=(1, 128, 1)
-            ).astype("int64"),
-            "id1": np.random.randint(low=0, high=128, size=(1, 128, 1)).astype(
+            "word_id": np.random.randint(low=0, high=128, size=(1, 128)).astype(
                 "int64"
             ),
-            "id2": np.random.randint(low=0, high=128, size=(1, 128, 1)).astype(
+            "pos_id": np.random.randint(low=0, high=128, size=(1, 128)).astype(
                 "int64"
             ),
-            "id3": np.random.randint(low=0, high=128, size=(1, 128, 1)).astype(
+            "sent_id": np.random.randint(low=0, high=128, size=(1, 128)).astype(
                 "int64"
             ),
-            "id4": np.random.randint(low=0, high=128, size=(1, 128, 1)).astype(
+            "id1": np.random.randint(low=0, high=128, size=(1, 128)).astype(
+                "int64"
+            ),
+            "id2": np.random.randint(low=0, high=128, size=(1, 128)).astype(
+                "int64"
+            ),
+            "id3": np.random.randint(low=0, high=128, size=(1, 128)).astype(
+                "int64"
+            ),
+            "id4": np.random.randint(low=0, high=128, size=(1, 128)).astype(
                 "int64"
             ),
         }
diff --git a/test/legacy_test/dist_ctr.py b/test/legacy_test/dist_ctr.py
index 4056e5bc2285e..148203d61ec68 100644
--- a/test/legacy_test/dist_ctr.py
+++ b/test/legacy_test/dist_ctr.py
@@ -53,7 +53,7 @@ def get_model(self, batch_size=2):
         # build dnn model
         dnn_layer_dims = [128, 64, 32, 1]
-        dnn_embedding = fluid.layers.embedding(
+        dnn_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=dnn_data,
             size=[dnn_input_dim, dnn_layer_dims[0]],
@@ -80,7 +80,7 @@ def get_model(self, batch_size=2):
         dnn_out = fc
         # build lr model
-        lr_embedding = fluid.layers.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
diff --git a/test/legacy_test/dist_fleet_ctr.py b/test/legacy_test/dist_fleet_ctr.py
index 7ac56dfff5886..a5634a0cfba28 100644
--- a/test/legacy_test/dist_fleet_ctr.py
+++ b/test/legacy_test/dist_fleet_ctr.py
@@ -101,7 +101,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01):
         # build dnn model
         dnn_layer_dims = [128, 128, 64, 32, 1]
-        dnn_embedding = fluid.layers.embedding(
+        dnn_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=dnn_data,
             size=[dnn_input_dim, dnn_layer_dims[0]],
@@ -113,7 +113,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01):
             padding_idx=0,
         )
         dnn_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=dnn_embedding, pool_type="sum"
+            input=dnn_embedding.squeeze(-2), pool_type="sum"
         )
         dnn_out = dnn_pool
         for i, dim in enumerate(dnn_layer_dims[1:]):
@@ -129,7 +129,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01):
             dnn_out = fc
         # build lr model
-        lr_embedding = fluid.layers.embedding(
+        lr_embedding = paddle.static.nn.embedding(
             is_distributed=False,
             input=lr_data,
             size=[lr_input_dim, 1],
@@ -141,7 +141,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01):
             padding_idx=0,
         )
         lr_pool = paddle.static.nn.sequence_lod.sequence_pool(
-            input=lr_embedding, pool_type="sum"
+            input=lr_embedding.squeeze(-2), pool_type="sum"
        )
         merge_layer = paddle.concat([dnn_out, lr_pool], axis=1)
diff --git a/test/legacy_test/dist_fleet_heter_pipeline_ctr.py b/test/legacy_test/dist_fleet_heter_pipeline_ctr.py
index 37a37ea2f7c35..db7bcf8fac1a4
100644 --- a/test/legacy_test/dist_fleet_heter_pipeline_ctr.py +++ b/test/legacy_test/dist_fleet_heter_pipeline_ctr.py @@ -72,7 +72,7 @@ def net(self, args, batch_size=4, lr=0.01): # build dnn model dnn_layer_dims = [128, 64, 32, 1] - dnn_embedding = fluid.layers.embedding( + dnn_embedding = paddle.static.nn.embedding( is_distributed=False, input=dnn_data, size=[dnn_input_dim, dnn_layer_dims[0]], @@ -88,7 +88,7 @@ def net(self, args, batch_size=4, lr=0.01): dnn_out = dnn_pool # build lr model - lr_embedding = fluid.layers.embedding( + lr_embedding = paddle.static.nn.embedding( is_distributed=False, input=lr_data, size=[lr_input_dim, 1], diff --git a/test/legacy_test/dist_text_classification.py b/test/legacy_test/dist_text_classification.py index bad17a3b6abde..0736fb6a38550 100644 --- a/test/legacy_test/dist_text_classification.py +++ b/test/legacy_test/dist_text_classification.py @@ -55,7 +55,7 @@ def conv_net( fc0_dim=96, class_dim=2, ): - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=input, size=[dict_dim, emb_dim], is_sparse=False, diff --git a/test/legacy_test/dist_word2vec.py b/test/legacy_test/dist_word2vec.py index bbda8ac3558be..3764fd5c5dcba 100644 --- a/test/legacy_test/dist_word2vec.py +++ b/test/legacy_test/dist_word2vec.py @@ -34,7 +34,7 @@ def get_model(self, batch_size=2): BATCH_SIZE = batch_size def __network__(words): - embed_first = fluid.layers.embedding( + embed_first = paddle.static.nn.embedding( input=words[0], size=[dict_size, EMBED_SIZE], dtype='float32', @@ -44,7 +44,7 @@ def __network__(words): initializer=paddle.nn.initializer.Constant(value=0.1), ), ) - embed_second = fluid.layers.embedding( + embed_second = paddle.static.nn.embedding( input=words[1], size=[dict_size, EMBED_SIZE], dtype='float32', @@ -54,7 +54,7 @@ def __network__(words): initializer=paddle.nn.initializer.Constant(value=0.1), ), ) - embed_third = fluid.layers.embedding( + embed_third = paddle.static.nn.embedding( input=words[2], size=[dict_size, EMBED_SIZE], dtype='float32', @@ -64,7 +64,7 @@ def __network__(words): initializer=paddle.nn.initializer.Constant(value=0.1), ), ) - embed_forth = fluid.layers.embedding( + embed_forth = paddle.static.nn.embedding( input=words[3], size=[dict_size, EMBED_SIZE], dtype='float32', diff --git a/test/legacy_test/fleet_heter_ps_training.py b/test/legacy_test/fleet_heter_ps_training.py index aec4634fbed16..4871506e58aaa 100644 --- a/test/legacy_test/fleet_heter_ps_training.py +++ b/test/legacy_test/fleet_heter_ps_training.py @@ -64,7 +64,7 @@ def net(batch_size=4, lr=0.01): # build dnn model dnn_layer_dims = [2, 1] - dnn_embedding = fluid.layers.embedding( + dnn_embedding = paddle.static.nn.embedding( is_distributed=False, input=dnn_data, size=[dnn_input_dim, dnn_layer_dims[0]], @@ -80,7 +80,7 @@ def net(batch_size=4, lr=0.01): dnn_out = dnn_pool # build lr model - lr_embedding = fluid.layers.embedding( + lr_embedding = paddle.static.nn.embedding( is_distributed=False, input=lr_data, size=[lr_input_dim, 1], diff --git a/test/legacy_test/nets.py b/test/legacy_test/nets.py index 0727bf7ead038..16a947b221e8b 100644 --- a/test/legacy_test/nets.py +++ b/test/legacy_test/nets.py @@ -330,7 +330,7 @@ def sequence_conv_pool( emb_dim = 128 hid_dim = 512 data = paddle.static.data(name="words", shape=[None, 1], dtype="int64", lod_level=1) - emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True) + emb = paddle.static.nn.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True) seq_conv = 
fluid.nets.sequence_conv_pool(input=emb, num_filters=hid_dim, filter_size=3, diff --git a/test/legacy_test/simple_nets.py b/test/legacy_test/simple_nets.py index 8d19bbe08da55..8ff57cdce22db 100644 --- a/test/legacy_test/simple_nets.py +++ b/test/legacy_test/simple_nets.py @@ -93,7 +93,7 @@ def bow_net( name="words", shape=[-1, 1], dtype="int64", lod_level=1 ) label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = paddle.static.nn.sequence_lod.sequence_pool( diff --git a/test/legacy_test/test_communicator_geo.py b/test/legacy_test/test_communicator_geo.py index 1c93b92b92b50..64a207160243d 100644 --- a/test/legacy_test/test_communicator_geo.py +++ b/test/legacy_test/test_communicator_geo.py @@ -36,7 +36,7 @@ def net(self): name='x1', shape=[-1, 1], dtype='int64', lod_level=1 ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=x1, size=[10000, 10], param_attr=fluid.ParamAttr( @@ -47,7 +47,7 @@ def net(self): ) pool = paddle.static.nn.sequence_lod.sequence_pool( - input=emb, pool_type="sum" + input=emb.squeeze(-2), pool_type="sum" ) z = paddle.concat([x, pool], axis=1) diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py index 8784e22c7b786..b7386500c4313 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -52,7 +52,7 @@ def test_a_sync_optimizer3(self): dtype="int64", lod_level=1, ) - x_embedding = paddle.fluid.layers.embedding( + x_embedding = paddle.static.nn.embedding( is_distributed=False, input=input_x, size=[1000000000, 100000], diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py index d43a4397ac3a7..bde7a3d1820be 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -49,7 +49,7 @@ def test_a_sync_optimizer2(self): input_x = paddle.static.data(name="x", shape=[-1, 1], dtype='int64') input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - emb = paddle.fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=input_x, size=[100, 10], is_sparse=True ) diff --git a/test/legacy_test/test_dist_fleet_heter_program.py b/test/legacy_test/test_dist_fleet_heter_program.py index 9ee46aadc51da..aad71627a9929 100644 --- a/test/legacy_test/test_dist_fleet_heter_program.py +++ b/test/legacy_test/test_dist_fleet_heter_program.py @@ -83,9 +83,8 @@ def build_input(self): def build_net(self, inputs): def embedding_layer(input): - return fluid.layers.embedding( + return paddle.static.nn.sparse_embedding( input=input, - is_sparse=True, size=[100001, 10], param_attr=fluid.ParamAttr( name="SparseFeatFactors", diff --git a/test/legacy_test/test_dist_fleet_ps.py b/test/legacy_test/test_dist_fleet_ps.py index eb423b3c341fc..03ee5bb67fcc5 100644 --- a/test/legacy_test/test_dist_fleet_ps.py +++ b/test/legacy_test/test_dist_fleet_ps.py @@ -75,7 +75,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="query_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - q_emb = fluid.layers.embedding( + q_emb = paddle.static.nn.embedding( input=q, is_distributed=is_distributed, size=[dict_dim, emb_dim], @@ -109,7 +109,7 @@ def get_loss(cos_q_pt, cos_q_nt): 
name="pos_title_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - pt_emb = fluid.layers.embedding( + pt_emb = paddle.static.nn.embedding( input=pt, is_distributed=is_distributed, size=[dict_dim, emb_dim], @@ -142,7 +142,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="neg_title_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - nt_emb = fluid.layers.embedding( + nt_emb = paddle.static.nn.embedding( input=nt, is_distributed=is_distributed, size=[dict_dim, emb_dim], diff --git a/test/legacy_test/test_dist_fleet_ps3.py b/test/legacy_test/test_dist_fleet_ps3.py index 9f1ff73b83018..59ca7c7dc6188 100644 --- a/test/legacy_test/test_dist_fleet_ps3.py +++ b/test/legacy_test/test_dist_fleet_ps3.py @@ -75,7 +75,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="query_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - q_emb = fluid.layers.embedding( + q_emb = paddle.static.nn.embedding( input=q, is_distributed=is_distributed, size=[dict_dim, emb_dim], @@ -109,7 +109,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="pos_title_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - pt_emb = fluid.layers.embedding( + pt_emb = paddle.static.nn.embedding( input=pt, is_distributed=is_distributed, size=[dict_dim, emb_dim], @@ -142,7 +142,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="neg_title_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - nt_emb = fluid.layers.embedding( + nt_emb = paddle.static.nn.embedding( input=nt, is_distributed=is_distributed, size=[dict_dim, emb_dim], diff --git a/test/legacy_test/test_dist_fleet_ps5.py b/test/legacy_test/test_dist_fleet_ps5.py index efc70346ab159..a7c363bd8287a 100644 --- a/test/legacy_test/test_dist_fleet_ps5.py +++ b/test/legacy_test/test_dist_fleet_ps5.py @@ -75,7 +75,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="query_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - q_emb = fluid.layers.embedding( + q_emb = paddle.static.nn.embedding( input=q, is_distributed=is_distributed, size=[dict_dim, emb_dim], @@ -109,7 +109,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="pos_title_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - pt_emb = fluid.layers.embedding( + pt_emb = paddle.static.nn.embedding( input=pt, is_distributed=is_distributed, size=[dict_dim, emb_dim], @@ -142,7 +142,7 @@ def get_loss(cos_q_pt, cos_q_nt): name="neg_title_ids", shape=[-1, 1], dtype="int64", lod_level=1 ) # embedding - nt_emb = fluid.layers.embedding( + nt_emb = paddle.static.nn.embedding( input=nt, is_distributed=is_distributed, size=[dict_dim, emb_dim], diff --git a/test/legacy_test/test_dist_sparse_load_ps0.py b/test/legacy_test/test_dist_sparse_load_ps0.py index 7eded27da1f58..bd1ebef36f25e 100644 --- a/test/legacy_test/test_dist_sparse_load_ps0.py +++ b/test/legacy_test/test_dist_sparse_load_ps0.py @@ -34,7 +34,7 @@ def net(self, emb_array, fc_array): 'input', shape=[None, 1], dtype="int64" ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=dense_input, is_sparse=True, size=[10, 10], diff --git a/test/legacy_test/test_dist_sparse_tensor_load_sgd.py b/test/legacy_test/test_dist_sparse_tensor_load_sgd.py index 4c08ca52beaa1..63f39626488fe 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_sgd.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_sgd.py @@ -49,7 +49,7 @@ def net(self): inputs = paddle.static.data( 'input', shape=[None, 1], dtype="int64" ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( inputs, is_sparse=True, size=[10000, 128] ) fc1 = 
paddle.static.nn.fc( diff --git a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py index 73ca10308eb87..14a3baf95d7a5 100644 --- a/test/legacy_test/test_dist_transpiler.py +++ b/test/legacy_test/test_dist_transpiler.py @@ -352,7 +352,7 @@ def net_conf(self): inputs = [input_word, true_word, neg_word] init_width = 0.5 / embedding_size - input_emb = fluid.layers.embedding( + input_emb = paddle.static.nn.embedding( input=inputs[0], is_sparse=True, size=[dict_size, embedding_size], @@ -364,7 +364,7 @@ def net_conf(self): ), ) - true_emb_w = fluid.layers.embedding( + true_emb_w = paddle.static.nn.embedding( input=inputs[1], is_sparse=True, size=[dict_size, embedding_size], @@ -374,7 +374,7 @@ def net_conf(self): ), ) - true_emb_b = fluid.layers.embedding( + true_emb_b = paddle.static.nn.embedding( input=inputs[1], is_sparse=True, size=[dict_size, 1], @@ -387,7 +387,7 @@ def net_conf(self): neg_word_reshape = paddle.reshape(inputs[2], shape=[-1, 1]) neg_word_reshape.stop_gradient = True - neg_emb_w = fluid.layers.embedding( + neg_emb_w = paddle.static.nn.embedding( input=neg_word_reshape, is_sparse=True, size=[dict_size, embedding_size], @@ -398,7 +398,7 @@ def net_conf(self): neg_emb_w, shape=[-1, neg_num, embedding_size] ) - neg_emb_b = fluid.layers.embedding( + neg_emb_b = paddle.static.nn.embedding( input=neg_word_reshape, is_sparse=True, size=[dict_size, 1], @@ -712,7 +712,7 @@ def network_with_table(self, is_sparse, is_distributed): self.lookup_table_name = 'shared_w' def emb_pool(ids, table_name, is_distributed): - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=ids, size=[self.table_size, self.emb_size], dtype='float32', @@ -1427,7 +1427,7 @@ def network_with_table(self, is_sparse, is_distributed): ) ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=input, is_sparse=is_sparse, size=[3, 3], diff --git a/test/legacy_test/test_downpoursgd.py b/test/legacy_test/test_downpoursgd.py index e5294926e9e6b..5bb65133b98a1 100644 --- a/test/legacy_test/test_downpoursgd.py +++ b/test/legacy_test/test_downpoursgd.py @@ -53,7 +53,7 @@ def test_device_work_use_cvm(self): ) os.system(cmd) x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64') - x_emb = fluid.layers.embedding( + x_emb = paddle.static.nn.embedding( input=x, size=[1, 2], is_distributed=True ) y_predict = paddle.static.nn.fc(x=x_emb, size=1) @@ -117,7 +117,7 @@ def test_device_work(self): ) os.system(cmd) x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64') - x_emb = fluid.layers.embedding( + x_emb = paddle.static.nn.embedding( input=x, size=[1, 2], is_distributed=True ) y_predict = paddle.static.nn.fc(x=x_emb, size=1) @@ -179,7 +179,7 @@ def test_downpour_opt_work(self): ) os.system(cmd) x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64') - x_emb = fluid.layers.embedding( + x_emb = paddle.static.nn.embedding( input=x, size=[1, 2], is_distributed=True ) y_predict = paddle.static.nn.fc(x=x_emb, size=1) diff --git a/test/legacy_test/test_eager_deletion_padding_rnn.py b/test/legacy_test/test_eager_deletion_padding_rnn.py index 29195c3a2fc12..bb00a8a4e20ff 100644 --- a/test/legacy_test/test_eager_deletion_padding_rnn.py +++ b/test/legacy_test/test_eager_deletion_padding_rnn.py @@ -19,7 +19,6 @@ import paddle from paddle import fluid -from paddle.fluid import layers from paddle.fluid.executor import Executor os.environ["CPU_NUM"] = "1" @@ -241,7 +240,7 @@ def encoder_static( init_cell, shape=[num_layers, -1, hidden_size] ) - x_emb = 
layers.embedding( + x_emb = paddle.static.nn.embedding( input=x, size=[vocab_size, hidden_size], dtype='float32', diff --git a/test/legacy_test/test_entry_attr2.py b/test/legacy_test/test_entry_attr2.py index c8b4af3b2d853..358e43c088cd2 100644 --- a/test/legacy_test/test_entry_attr2.py +++ b/test/legacy_test/test_entry_attr2.py @@ -31,7 +31,7 @@ def embedding_layer(self): input = paddle.static.data( name="dnn_data", shape=[-1, 1], dtype="int64", lod_level=1 ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=input, size=[100, 10], is_sparse=True, diff --git a/test/legacy_test/test_fleet.py b/test/legacy_test/test_fleet.py index c861304d9a2af..245fa15ec83bf 100644 --- a/test/legacy_test/test_fleet.py +++ b/test/legacy_test/test_fleet.py @@ -62,7 +62,7 @@ def test_pslib_1(self): dtype="int64", lod_level=1, ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=show, size=[1, 1], is_sparse=True, diff --git a/test/legacy_test/test_fleet_base_2.py b/test/legacy_test/test_fleet_base_2.py index c1d7d49326585..667de8759f6b6 100644 --- a/test/legacy_test/test_fleet_base_2.py +++ b/test/legacy_test/test_fleet_base_2.py @@ -45,9 +45,7 @@ def test_ps_minimize(self): ) input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - emb = paddle.fluid.layers.embedding( - input=input_slot, size=[10, 9], is_sparse=True - ) + emb = paddle.static.nn.sparse_embedding(input=input_slot, size=[10, 9]) input_x = paddle.concat(x=[input_x, emb], axis=1) fc_1 = paddle.static.nn.fc(x=input_x, size=64, activation='tanh') fc_2 = paddle.static.nn.fc(x=fc_1, size=64, activation='tanh') diff --git a/test/legacy_test/test_fleet_nocvm_1.py b/test/legacy_test/test_fleet_nocvm_1.py index 8fbe4984e2f3f..26c94cbb54228 100644 --- a/test/legacy_test/test_fleet_nocvm_1.py +++ b/test/legacy_test/test_fleet_nocvm_1.py @@ -62,7 +62,7 @@ def test_pslib_1(self): dtype="int64", lod_level=1, ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=show, size=[1, 1], is_sparse=True, diff --git a/test/legacy_test/test_fleet_unitaccessor.py b/test/legacy_test/test_fleet_unitaccessor.py index 4145f9f1ce9b2..2228a8f6863f8 100644 --- a/test/legacy_test/test_fleet_unitaccessor.py +++ b/test/legacy_test/test_fleet_unitaccessor.py @@ -59,7 +59,7 @@ def test_pslib_1(self): show = paddle.static.data( name="show", shape=[-1, 1], dtype="int64", lod_level=1 ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=show, size=[1, 1], is_sparse=True, diff --git a/test/legacy_test/test_gradient_clip.py b/test/legacy_test/test_gradient_clip.py index 5b72f03339e31..cc91c85bee0d1 100644 --- a/test/legacy_test/test_gradient_clip.py +++ b/test/legacy_test/test_gradient_clip.py @@ -33,7 +33,7 @@ def bow_net( This model is from https://github.com/PaddlePaddle/models: fluid/PaddleNLP/text_classification/nets.py """ - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=data, is_sparse=True, size=[dict_dim, emb_dim] ) bow = paddle.static.nn.sequence_lod.sequence_pool( diff --git a/test/legacy_test/test_hsigmoid_op.py b/test/legacy_test/test_hsigmoid_op.py index 752fbab31d57a..5e566a75d04bc 100644 --- a/test/legacy_test/test_hsigmoid_op.py +++ b/test/legacy_test/test_hsigmoid_op.py @@ -294,7 +294,7 @@ def hs_net_conf(self, is_sparse): data_list = [input_word, path_table, path_code, label] - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=input_word, is_sparse=is_sparse, size=[3, 3], diff --git a/test/legacy_test/test_layers.py 
b/test/legacy_test/test_layers.py index e345be328b505..25d5ad42bea48 100644 --- a/test/legacy_test/test_layers.py +++ b/test/legacy_test/test_layers.py @@ -24,7 +24,7 @@ import paddle import paddle.nn.functional as F from paddle import fluid -from paddle.fluid import core, layers +from paddle.fluid import core from paddle.fluid.dygraph import base, to_variable from paddle.fluid.framework import Program, default_main_program, program_guard from paddle.incubate.layers.nn import ( @@ -609,8 +609,8 @@ def test_embeding(self): name='word', shape=[-1, 1], dtype='int64' ) data_t.desc.set_need_check_feed(False) - emb = layers.embedding( - input=data_t, + emb = paddle.static.nn.embedding( + input=data_t.squeeze(-2), size=[dict_size, 32], param_attr='emb.w', is_sparse=False, @@ -1662,26 +1662,26 @@ def make_word_embedding(self): forth_word = self._get_data(name='forthw', shape=[1], dtype='int64') next_word = self._get_data(name='nextw', shape=[1], dtype='int64') - embed_first = layers.embedding( + embed_first = paddle.static.nn.embedding( input=first_word, size=[dict_size, embed_size], dtype='float32', param_attr='shared_w', ) - embed_second = layers.embedding( + embed_second = paddle.static.nn.embedding( input=second_word, size=[dict_size, embed_size], dtype='float32', param_attr='shared_w', ) - embed_third = layers.embedding( + embed_third = paddle.static.nn.embedding( input=third_word, size=[dict_size, embed_size], dtype='float32', param_attr='shared_w', ) - embed_forth = layers.embedding( + embed_forth = paddle.static.nn.embedding( input=forth_word, size=[dict_size, embed_size], dtype='float32', @@ -1754,7 +1754,7 @@ def make_nce(self): if i == label_word: continue - emb = layers.embedding( + emb = paddle.static.nn.embedding( input=words[i], size=[dict_size, 32], param_attr='emb.w', diff --git a/test/legacy_test/test_lookup_table_bf16_op.py b/test/legacy_test/test_lookup_table_bf16_op.py index 6f932f780c977..48cb484f0d81d 100644 --- a/test/legacy_test/test_lookup_table_bf16_op.py +++ b/test/legacy_test/test_lookup_table_bf16_op.py @@ -236,7 +236,7 @@ def setUp(self): x = paddle.static.data( name='x', shape=self.ids_shape, dtype='int64' ) - self.emb = fluid.layers.embedding( + self.emb = paddle.static.nn.embedding( input=x, size=self.w_shape, param_attr=fluid.ParamAttr( @@ -256,7 +256,7 @@ def test_embedding_weights(self): np.testing.assert_array_equal(self.w_fp32, result) def test_lookup_results(self): - lookup_result = convert_uint16_to_float(self.result[1]) + lookup_result = convert_uint16_to_float(self.result[1].squeeze(-2)) lookup_ref = _lookup(self.w_fp32, self.ids, self.flat_ids) np.testing.assert_array_equal(lookup_result, lookup_ref) diff --git a/test/legacy_test/test_lookup_table_op.py b/test/legacy_test/test_lookup_table_op.py index cd26f390747ee..04ac09bdce996 100644 --- a/test/legacy_test/test_lookup_table_op.py +++ b/test/legacy_test/test_lookup_table_op.py @@ -25,7 +25,6 @@ import paddle import paddle.nn.functional as F -from paddle import fluid from paddle.fluid import Program, core, program_guard @@ -168,7 +167,7 @@ def test_errors(self): def test_Variable(): # the input type must be Variable - fluid.layers.embedding(input=input_data, size=(10, 64)) + paddle.static.nn.embedding(input=input_data, size=(10, 64)) self.assertRaises(TypeError, test_Variable) @@ -177,7 +176,7 @@ def test_input_dtype(): input = paddle.static.data( name='x', shape=[4, 1], dtype='float32' ) - fluid.layers.embedding(input=input, size=(10, 64)) + paddle.static.nn.embedding(input=input, size=(10, 64)) 
self.assertRaises(TypeError, test_input_dtype) @@ -186,7 +185,7 @@ def test_param_dtype(): input2 = paddle.static.data( name='x2', shape=[4, 1], dtype='int64' ) - fluid.layers.embedding( + paddle.static.nn.embedding( input=input2, size=(10, 64), dtype='int64' ) @@ -195,7 +194,7 @@ def test_param_dtype(): input3 = paddle.static.data( name='x3', shape=[4, 1], dtype='int64' ) - fluid.layers.embedding( + paddle.static.nn.embedding( input=input3, size=(10, 64), dtype='float16' ) diff --git a/test/legacy_test/test_monitor.py b/test/legacy_test/test_monitor.py index 128f9bb1dcbda..37fb9cb898a63 100644 --- a/test/legacy_test/test_monitor.py +++ b/test/legacy_test/test_monitor.py @@ -61,7 +61,9 @@ def test_dataset_run_with_stat(self): embs = [] for x in slots_vars: - emb = fluid.layers.embedding(x, is_sparse=True, size=[100001, 4]) + emb = paddle.static.nn.embedding( + x, is_sparse=True, size=[100001, 4] + ) embs.append(emb) dataset = paddle.distributed.InMemoryDataset() diff --git a/test/legacy_test/test_regularizer.py b/test/legacy_test/test_regularizer.py index af7103d704f48..d8add3c3760d1 100644 --- a/test/legacy_test/test_regularizer.py +++ b/test/legacy_test/test_regularizer.py @@ -127,7 +127,7 @@ def bow_net( This model is from https://github.com/PaddlePaddle/models: fluid/PaddleNLP/text_classification/nets.py """ - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = paddle.static.nn.sequence_lod.sequence_pool( diff --git a/test/legacy_test/test_regularizer_api.py b/test/legacy_test/test_regularizer_api.py index 415a5d963b1ad..a00dc07022c49 100644 --- a/test/legacy_test/test_regularizer_api.py +++ b/test/legacy_test/test_regularizer_api.py @@ -39,7 +39,7 @@ def bow_net( This model is from https://github.com/PaddlePaddle/models: fluid/PaddleNLP/text_classification/nets.py """ - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = paddle.static.nn.sequence_lod.sequence_pool( diff --git a/test/legacy_test/test_sgd_op_bf16.py b/test/legacy_test/test_sgd_op_bf16.py index 76a9819b073c2..9b58c7b00d2ce 100644 --- a/test/legacy_test/test_sgd_op_bf16.py +++ b/test/legacy_test/test_sgd_op_bf16.py @@ -342,7 +342,7 @@ def test_sgd(self): label = paddle.static.data( name='Y', shape=[-1] + y_shape, dtype='uint16' ) - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=x, size=self.w_shape, param_attr=fluid.ParamAttr( diff --git a/test/legacy_test/test_weight_decay.py b/test/legacy_test/test_weight_decay.py index ae85324e9d64a..41bea82c4cd03 100644 --- a/test/legacy_test/test_weight_decay.py +++ b/test/legacy_test/test_weight_decay.py @@ -54,7 +54,7 @@ def bow_net( This model is from https://github.com/PaddlePaddle/models: fluid/PaddleNLP/text_classification/nets.py """ - emb = fluid.layers.embedding( + emb = paddle.static.nn.embedding( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = paddle.static.nn.sequence_lod.sequence_pool( diff --git a/test/legacy_test/transformer_model.py b/test/legacy_test/transformer_model.py index 14010c06f0b30..03f926c1fb4c0 100644 --- a/test/legacy_test/transformer_model.py +++ b/test/legacy_test/transformer_model.py @@ -18,7 +18,6 @@ import paddle from paddle import fluid -from paddle.fluid import layers pos_enc_param_names = ( "src_pos_enc_table", @@ -264,13 +263,13 @@ def prepare_encoder( This module is used at the bottom of the encoder stacks. 
""" - src_word_emb = layers.embedding( + src_word_emb = paddle.static.nn.embedding( src_word, size=[src_vocab_size, src_emb_dim], padding_idx=src_pad_idx, param_attr=paddle.nn.initializer.Normal(0.0, 1.0), ) - src_pos_enc = layers.embedding( + src_pos_enc = paddle.static.nn.embedding( src_pos, size=[src_max_len, src_emb_dim], padding_idx=pos_pad_idx,