From ec2c81b2b52309e71e1f0048c7174b92771cfced Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Sun, 17 Dec 2023 09:50:28 +0000
Subject: [PATCH 1/2] add layer_norm_eps

---
 python/paddle/nn/layer/transformer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 511e091c71afd6..1f691030e76378 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -779,6 +779,7 @@ class TransformerDecoderLayer(Layer):
             and post-precess of MHA and FFN sub-layer. Default 0.1
         activation (str, optional): The activation function in the feedforward
             network. Default relu.
+        layer_norm_eps: the eps value in layer normalization components. Default=1e-5.
         attn_dropout (float, optional): The dropout probability used
             in MHA to drop some attention target. If None, use the value of
             `dropout`. Default None
@@ -838,6 +839,7 @@ def __init__(
         dim_feedforward,
         dropout=0.1,
         activation="relu",
+        layer_norm_eps=1e-5,
         attn_dropout=None,
         act_dropout=None,
         normalize_before=False,
@@ -889,9 +891,9 @@ def __init__(
         self.linear2 = Linear(
             dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]
         )
-        self.norm1 = LayerNorm(d_model)
-        self.norm2 = LayerNorm(d_model)
-        self.norm3 = LayerNorm(d_model)
+        self.norm1 = LayerNorm(d_model, layer_norm_eps)
+        self.norm2 = LayerNorm(d_model, layer_norm_eps)
+        self.norm3 = LayerNorm(d_model, layer_norm_eps)
         self.dropout1 = Dropout(dropout, mode="upscale_in_train")
         self.dropout2 = Dropout(dropout, mode="upscale_in_train")
         self.dropout3 = Dropout(dropout, mode="upscale_in_train")

From 00ac467f3904518949afa004efceab5f2810e91b Mon Sep 17 00:00:00 2001
From: YibinLiu666 <2632839426@qq.com>
Date: Tue, 19 Dec 2023 02:17:44 +0000
Subject: [PATCH 2/2] fix ci

---
 python/paddle/nn/layer/transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 1f691030e76378..51b4de0f33a3af 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -779,7 +779,6 @@ class TransformerDecoderLayer(Layer):
             and post-precess of MHA and FFN sub-layer. Default 0.1
         activation (str, optional): The activation function in the feedforward
             network. Default relu.
-        layer_norm_eps: the eps value in layer normalization components. Default=1e-5.
         attn_dropout (float, optional): The dropout probability used
             in MHA to drop some attention target. If None, use the value of
             `dropout`. Default None
@@ -807,6 +806,7 @@ class TransformerDecoderLayer(Layer):
             corresponding layer would not have trainable bias parameter. See
             usage for details in :code:`ParamAttr` . Default: None,which
             means the default bias parameter property is used.
+        layer_norm_eps: the eps value in layer normalization components. Default=1e-5.

    Examples:

@@ -839,12 +839,12 @@ def __init__(
         dim_feedforward,
         dropout=0.1,
         activation="relu",
-        layer_norm_eps=1e-5,
         attn_dropout=None,
         act_dropout=None,
         normalize_before=False,
         weight_attr=None,
         bias_attr=None,
+        layer_norm_eps=1e-5,
     ):
         self._config = locals()
         self._config.pop("self")
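
For reference, a minimal usage sketch of the new keyword argument on a build that includes both patches above. The tensor shapes and the non-default epsilon below are illustrative, not taken from the patch; after PATCH 2/2 the argument sits last in the signature, so it is passed by name.

import paddle
from paddle.nn import TransformerDecoderLayer

# layer_norm_eps is forwarded to the three LayerNorm sub-layers (norm1/2/3);
# omitting it keeps the previous behaviour (eps = 1e-5).
decoder_layer = TransformerDecoderLayer(
    d_model=128,
    nhead=2,
    dim_feedforward=512,
    layer_norm_eps=1e-6,  # illustrative non-default value
)

tgt = paddle.rand((2, 4, 128))     # (batch, target_len, d_model)
memory = paddle.rand((2, 6, 128))  # (batch, source_len, d_model)
out = decoder_layer(tgt, memory)   # out has shape [2, 4, 128]

Appending the parameter at the end of the signature (rather than inserting it after `activation`, as PATCH 1/2 did) keeps positional calls to the existing arguments working, which is what the "fix ci" follow-up addresses.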