From de2612817441c5a08557550e9a14ddb6ad7483a7 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Tue, 14 Jun 2022 06:46:58 +0000
Subject: [PATCH] Add checkpoint and add the check of add_residual when
 pre_layer_norm is false.

---
 paddle/fluid/operators/fused/fused_attention_op.cc   | 9 +++++++++
 paddle/fluid/operators/fused/fused_attention_op.cu   | 5 +++++
 paddle/fluid/operators/fused/fused_feedforward_op.cc | 8 ++++++++
 paddle/fluid/operators/fused/fused_feedforward_op.cu | 5 +++++
 4 files changed, 27 insertions(+)

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc
index 6639e184db135..32dbe2b180c61 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cc
+++ b/paddle/fluid/operators/fused/fused_attention_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>

 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"

 namespace paddle {
 namespace operators {
@@ -656,3 +657,11 @@ REGISTER_OPERATOR(fused_attention, ops::FusedAttentionOp,
                   ops::FusedAttentionGradOpMaker<paddle::framework::OpDesc>,
                   ops::FusedAttentionGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(fused_attention_grad, ops::FusedAttentionGradOp);
+
+REGISTER_OP_VERSION(fused_attention)
+    .AddCheckpoint(
+        R"ROC(
+              Add a new attribute [add_residual] )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "add_residual", "A flag to indicate whether to add residual.",
+            true));
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 24ae1dab849ab..4f2879787fa3d 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -254,6 +254,11 @@ class FusedAttentionOpKernel : public framework::OpKernel<T> {
           ctx.cuda_device_context(), out_linear_out_data, residual_ptr,
           out_linear_bias_data, final_out_data, dropout_mask_out_data);
     } else {
+      PADDLE_ENFORCE_EQ(add_residual, true,
+                        platform::errors::InvalidArgument(
+                            "Attribute add_residual is expected to be true "
+                            "when pre_layer_norm is false."));
+
       const U *ln_scale_2_ptr = ln_scale_2 ? ln_scale_2->data<U>() : nullptr;
       const U *ln_bias_2_ptr = ln_bias_2 ? ln_bias_2->data<U>() : nullptr;
       T *bias_dropout_residual_out_ptr =
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index 63be11f2f5cfb..138515b21d917 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -368,3 +368,11 @@ REGISTER_OPERATOR(fused_feedforward, ops::FusedFeedForwardOp,
                   ops::FusedFeedForwardOpGradMaker<paddle::framework::OpDesc>,
                   ops::FusedFeedForwardOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(fused_feedforward_grad, ops::FusedFeedForwardOpGrad);
+
+REGISTER_OP_VERSION(fused_feedforward)
+    .AddCheckpoint(
+        R"ROC(
+              Add a new attribute [add_residual] )ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "add_residual", "A flag to indicate whether to add residual.",
+            true));
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index dfc39c0f490a6..e7a96354b9c23 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -129,6 +129,11 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {

     const T* residual_ptr = add_residual ? x.data<T>() : nullptr;
     if (!pre_layer_norm) {
+      PADDLE_ENFORCE_EQ(add_residual, true,
+                        platform::errors::InvalidArgument(
+                            "Attribute add_residual is expected to be true "
+                            "when pre_layer_norm is false."));
+
       fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
           ctx, linear2_out.data<T>(), residual_ptr, linear2_bias_ptr,
           ln2_scale_ptr, ln2_bias_ptr, dropout2_out->data<T>(),