replace embedding in fluid with 2.0 version (#55757)
* replace embedding

* replace sparse_embedding

* fix some bugs

* del embedding

* replace layers.embedding

* fix type error
Difers authored Aug 4, 2023
1 parent 274e5e5 commit 2d91a9b
Showing 48 changed files with 141 additions and 301 deletions.
166 changes: 0 additions & 166 deletions python/paddle/fluid/layers/nn.py
@@ -57,176 +57,10 @@


__all__ = [
'embedding',
'autoincreased_step_counter',
]


@deprecated(since="2.0.0", update_to="paddle.nn.functional.embedding")
def embedding(
input,
size,
is_sparse=False,
is_distributed=False,
padding_idx=None,
param_attr=None,
dtype='float32',
):
r"""
:api_attr: Static Graph
**WARNING:** This OP will be deprecated in a future release. It requires that the
last dimension of the input Tensor shape be equal to 1. It is recommended to use
:ref:`api_fluid_embedding` instead.
This OP looks up the embedding vectors of the ids provided by :attr:`input` .
It automatically constructs a 2D embedding matrix based on the
input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` .
The shape of the output Tensor is generated by replacing the last dimension of the
input Tensor shape with emb_size.
**Note:** Each id in :attr:`input` must satisfy :math:`0 <= id < size[0]` ,
otherwise the program will throw an exception and exit.
.. code-block:: text
Case 1:
input is a Tensor. padding_idx = -1
input.data = [[[1], [3]], [[2], [4]], [[4], [127]]]
input.shape = [3, 2, 1]
Given size = [128, 16]
output is a Tensor:
out.shape = [3, 2, 16]
out.data = [[[0.129435295, 0.244512452, ..., 0.436322452],
[0.345421456, 0.524563927, ..., 0.144534654]],
[[0.345249859, 0.124939536, ..., 0.194353745],
[0.945345345, 0.435394634, ..., 0.435345365]],
[[0.945345345, 0.435394634, ..., 0.435345365],
[0.0, 0.0, ..., 0.0 ]]] # padding data
Since the input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127.
All-zero data is padded when an id equals 127.
Case 2:
input is a LoDTensor with 1-level LoD. padding_idx = 0
input.lod = [[2, 3]]
input.data = [[1], [3], [2], [4], [0]]
input.shape = [5, 1]
Given size = [128, 16]
output is a LoDTensor:
out.lod = [[2, 3]]
out.shape = [5, 16]
out.data = [[0.129435295, 0.244512452, ..., 0.436322452],
[0.345421456, 0.524563927, ..., 0.144534654],
[0.345249859, 0.124939536, ..., 0.194353745],
[0.945345345, 0.435394634, ..., 0.435345365],
[0.0, 0.0, ..., 0.0 ]] # padding data
All-zero data is padded when an id equals 0.
Args:
input(Variable): A Tensor or LoDTensor of type int64, which contains the id information.
The last dimension of the Tensor shape must be equal to 1. The value of each input id should
satisfy :math:`0 <= id < size[0]` .
size(tuple|list): The shape of the lookup table parameter. It should have two elements, which
indicate the size of the dictionary of embeddings and the size of each embedding vector respectively.
is_sparse(bool): The flag indicating whether to use sparse update. This parameter only
affects the performance of the backward gradient update. It is recommended to set it to
True because sparse update is faster. However, some optimizers do not support sparse update,
such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` ,
:ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` ,
:ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` .
In these cases, is_sparse must be False. Default: False.
is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used
in multi-machine distributed CPU training. Default: False.
padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size).
If :math:`padding\_idx < 0`, it is automatically converted to
:math:`vocab\_size + padding\_idx` . All-zero padding data is output whenever the lookup
encounters :math:`padding\_idx` among the ids, and the padding data is not updated during training.
If set to None, it has no effect on the output. Default: None.
param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition,
user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter.
The local word vector needs to be transformed into numpy format, and the shape of local word
vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer`
is used to load custom or pre-trained word vectors. See code example 2 for details.
dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor.
It must be float32 or float64. Default: float32.
Returns:
Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` .
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
import paddle
paddle.enable_static()
data = paddle.static.data(name='x', shape=[None, 1], dtype='int64')
# example 1
emb_1 = paddle.static.nn.embedding(input=data, size=[128, 64])
# example 2: load custom or pre-trained word vectors
weight_data = np.random.random(size=(128, 100)) # word vectors with numpy format
w_param_attrs = fluid.ParamAttr(
name="emb_weight",
learning_rate=0.5,
initializer=paddle.nn.initializer.Assign(weight_data),
trainable=True)
emb_2 = fluid.layers.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32')
"""

helper = LayerHelper('embedding', **locals())
check_variable_and_dtype(
input, 'input', ['int64'], 'fluid.layers.embedding'
)
check_dtype(
dtype,
'dtype',
['uint16', 'float16', 'float32', 'float64'],
'fluid.layers.embedding',
)

if is_distributed:
is_distributed = False
warnings.warn(
"is_distributed is go out of use, `paddle.static.nn.sparse_embedding` is your needed"
)

remote_prefetch = True if is_sparse else False

w = helper.create_parameter(
attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False
)
tmp = helper.create_variable_for_type_inference(dtype)
padding_idx = (
-1
if padding_idx is None
else padding_idx
if padding_idx >= 0
else (size[0] + padding_idx)
)
helper.append_op(
type='lookup_table',
inputs={'Ids': input, 'W': w},
outputs={'Out': tmp},
attrs={
'is_sparse': is_sparse,
'is_distributed': is_distributed,
'remote_prefetch': remote_prefetch,
'padding_idx': padding_idx,
},
)
return tmp


def autoincreased_step_counter(counter_name=None, begin=1, step=1):
"""
:api_attr: Static Graph
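Note: the removed fluid.layers.embedding maps directly onto the 2.0 APIs that the rest of this diff switches to. The deprecation decorator points at paddle.nn.functional.embedding, and the static-graph callers below now use paddle.static.nn.embedding. A minimal migration sketch, assuming static-graph mode (names and sizes are illustrative, not taken from the diff):

    import paddle

    paddle.enable_static()

    # int64 ids with a trailing dimension of 1, as the removed helper required
    ids = paddle.static.data(name='ids', shape=[None, 1], dtype='int64')

    # before this commit:
    #   emb = paddle.fluid.layers.embedding(input=ids, size=[128, 64], is_sparse=True)
    # 2.0-style replacement used throughout the diff:
    emb = paddle.static.nn.embedding(input=ids, size=[128, 64], is_sparse=True)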
@@ -157,7 +157,7 @@ def get_sparse_attrs():

if len(dist_varnames) != 0:
raise ValueError(
"GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
"GeoStrategy can not support large scale embeding now, please use paddle.static.nn.embedding"
)

init_attrs = []
@@ -1124,7 +1124,7 @@ class fleet_embedding:
Example:
.. code-block:: python
with fleet_embedding(click_name=label.name):
emb = fluid.layers.embedding(
emb = paddle.static.nn.embedding(
input=var,
size=[-1, 11],
is_sparse=True,
@@ -1134,7 +1134,6 @@ class fleet_embedding:

def __init__(self, click_name, scale_sparse_grad=True):
"""Init."""
# self.origin_emb = fluid.layers.embedding
self.origin_emb_v2 = paddle.static.nn.embedding
# if user uses cvm layer after embedding, click_name can be None
self.click_name = "" if click_name is None else click_name
@@ -1144,15 +1143,13 @@ def __init__(self, click_name, scale_sparse_grad=True):

def __enter__(self):
"""Enter."""
# fluid.layers.embedding = _fleet_embedding
paddle.static.nn.embedding = _fleet_embedding_v2
FLEET_GLOBAL_DICT["cur_accessor"] = self.accessor
FLEET_GLOBAL_DICT["click_name"] = self.click_name
FLEET_GLOBAL_DICT["scale_sparse_grad"] = self.scale_sparse_grad

def __exit__(self, exc_type, exc_val, exc_tb):
"""Exit."""
# fluid.layers.embedding = self.origin_emb
paddle.static.nn.embedding = self.origin_emb_v2
FLEET_GLOBAL_DICT["cur_accessor"] = ""
FLEET_GLOBAL_DICT["click_name"] = ""
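As the __enter__ and __exit__ bodies above show, fleet_embedding works by temporarily rebinding the module-level paddle.static.nn.embedding symbol and restoring the original on exit. A stripped-down sketch of that swap pattern (illustrative only, not the Paddle source):

    import paddle

    class _swap_static_embedding:
        """Temporarily replace paddle.static.nn.embedding, restoring it on exit."""

        def __init__(self, replacement):
            self._replacement = replacement

        def __enter__(self):
            self._original = paddle.static.nn.embedding
            paddle.static.nn.embedding = self._replacement

        def __exit__(self, exc_type, exc_val, exc_tb):
            paddle.static.nn.embedding = self._original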
25 changes: 19 additions & 6 deletions test/auto_parallel/test_dist_embedding.py
@@ -31,13 +31,26 @@ def make_program_lookup_table_v1_mp_dp():
name='src_ids', shape=[12, 512, 1], dtype='int64'
)
src_ids.stop_gradient = True
emb_out = paddle.fluid.layers.embedding(
input=src_ids,
size=[64, 128],
param_attr=paddle.fluid.ParamAttr(name="emb_weight"),
dtype="float32",
is_sparse=False,

emb_out = block.create_var(name='emb_out', dtype='float32')
w = paddle.create_parameter(
attr=paddle.fluid.ParamAttr(name="emb_weight"),
shape=[64, 128],
dtype='float32',
is_bias=False,
)
block.append_op(
type='lookup_table',
outputs={'Out': emb_out},
inputs={'Ids': src_ids, 'W': w},
attrs={
'is_sparse': False,
'is_distributed': False,
'remote_prefetch': False,
'padding_idx': None,
},
)

loss = paddle.mean(emb_out)

auto.shard_tensor(
2 changes: 1 addition & 1 deletion test/book/notest_understand_sentiment.py
@@ -31,7 +31,7 @@
def convolution_net(
data, label, input_dim, class_dim=2, emb_dim=32, hid_dim=32
):
emb = fluid.layers.embedding(
emb = paddle.static.nn.embedding(
input=data, size=[input_dim, emb_dim], is_sparse=True
)
conv_3 = nets.sequence_conv_pool(
20 changes: 10 additions & 10 deletions test/book/test_recommender_system.py
@@ -25,7 +25,7 @@

import paddle
from paddle import fluid
from paddle.fluid import framework, layers
from paddle.fluid import framework
from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGDOptimizer

@@ -44,7 +44,7 @@ def get_usr_combined_features():

uid = paddle.static.data(name='user_id', shape=[-1, 1], dtype='int64')

usr_emb = layers.embedding(
usr_emb = paddle.static.nn.embedding(
input=uid,
dtype='float32',
size=[USR_DICT_SIZE, 32],
@@ -60,7 +60,7 @@
name='gender_id', shape=[-1, 1], dtype='int64'
)

usr_gender_emb = layers.embedding(
usr_gender_emb = paddle.static.nn.embedding(
input=usr_gender_id,
size=[USR_GENDER_DICT_SIZE, 16],
param_attr='gender_table',
@@ -72,7 +72,7 @@
USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
usr_age_id = paddle.static.data(name='age_id', shape=[-1, 1], dtype="int64")

usr_age_emb = layers.embedding(
usr_age_emb = paddle.static.nn.embedding(
input=usr_age_id,
size=[USR_AGE_DICT_SIZE, 16],
is_sparse=IS_SPARSE,
@@ -84,7 +84,7 @@
USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
usr_job_id = paddle.static.data(name='job_id', shape=[-1, 1], dtype="int64")

usr_job_emb = layers.embedding(
usr_job_emb = paddle.static.nn.embedding(
input=usr_job_id,
size=[USR_JOB_DICT_SIZE, 16],
param_attr='job_table',
@@ -109,7 +109,7 @@ def get_mov_combined_features():

mov_id = paddle.static.data(name='movie_id', shape=[-1, 1], dtype='int64')

mov_emb = layers.embedding(
mov_emb = paddle.static.nn.embedding(
input=mov_id,
dtype='float32',
size=[MOV_DICT_SIZE, 32],
@@ -125,12 +125,12 @@ def get_mov_combined_features():
name='category_id', shape=[-1, 1], dtype='int64', lod_level=1
)

mov_categories_emb = layers.embedding(
mov_categories_emb = paddle.static.nn.embedding(
input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE
)

mov_categories_hidden = paddle.static.nn.sequence_lod.sequence_pool(
input=mov_categories_emb, pool_type="sum"
input=mov_categories_emb.squeeze(-2), pool_type="sum"
)

MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
@@ -139,12 +139,12 @@
name='movie_title', shape=[-1, 1], dtype='int64', lod_level=1
)

mov_title_emb = layers.embedding(
mov_title_emb = paddle.static.nn.embedding(
input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE
)

mov_title_conv = nets.sequence_conv_pool(
input=mov_title_emb,
input=mov_title_emb.squeeze(-2),
num_filters=32,
filter_size=3,
act="tanh",
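The .squeeze(-2) calls added above compensate for a shape difference between the two APIs: the removed fluid.layers.embedding replaced the trailing size-1 dimension of the ids with emb_size, while paddle.static.nn.embedding appends emb_size, leaving an extra singleton axis that the sequence ops do not expect. This reading is inferred from the docstrings rather than stated in the diff; roughly:

    # ids shape [batch, 1] (int64), embedding table [CATEGORY_DICT_SIZE, 32] -- illustrative shapes
    # removed fluid.layers.embedding -> output [batch, 32]    (trailing 1 replaced by 32)
    # paddle.static.nn.embedding     -> output [batch, 1, 32] (32 appended)
    emb = paddle.static.nn.embedding(input=category_id, size=[CATEGORY_DICT_SIZE, 32])
    pooled = paddle.static.nn.sequence_lod.sequence_pool(
        input=emb.squeeze(-2), pool_type="sum"  # drop the singleton axis first
    )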
8 changes: 4 additions & 4 deletions test/book/test_word2vec_book.py
@@ -58,28 +58,28 @@ def train(
IS_SPARSE = is_sparse

def __network__(words):
embed_first = fluid.layers.embedding(
embed_first = paddle.static.nn.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w',
)
embed_second = fluid.layers.embedding(
embed_second = paddle.static.nn.embedding(
input=words[1],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w',
)
embed_third = fluid.layers.embedding(
embed_third = paddle.static.nn.embedding(
input=words[2],
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr='shared_w',
)
embed_forth = fluid.layers.embedding(
embed_forth = paddle.static.nn.embedding(
input=words[3],
size=[dict_size, EMBED_SIZE],
dtype='float32',
2 changes: 1 addition & 1 deletion test/ipu/distributed/test_dist_pod128_sample.py
@@ -59,7 +59,7 @@ def TestDistTraining():
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64')
with paddle.static.ipu_shard_guard(index=0, stage=0):
out = paddle.fluid.layers.embedding(x, **attrs)
out = paddle.static.nn.embedding(x, **attrs)
with paddle.static.ipu_shard_guard(index=1, stage=1):
loss = paddle.mean(out)
opt = paddle.optimizer.Adam(learning_rate=1e-1)
2 changes: 1 addition & 1 deletion test/ipu/distributed/test_dist_sample.py
@@ -77,7 +77,7 @@ def Test(use_dist, file_name):
with paddle.static.program_guard(main_prog, startup_prog):
x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64')

out = paddle.fluid.layers.embedding(x, **attrs)
out = paddle.static.nn.embedding(x, **attrs)
loss = paddle.mean(out)
opt = paddle.optimizer.Adam(learning_rate=1e-1)
opt.minimize(loss)