From f450797421ba21ae984f21b53418d362d74ef7e3 Mon Sep 17 00:00:00 2001 From: WangXi Date: Thu, 28 Apr 2022 10:32:19 +0800 Subject: [PATCH 01/10] fix fused_multi_transformer compile failed in cuda arch < sm53 (#42315) --- paddle/fluid/operators/fused/fused_multi_transformer_op.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index f4a5319a68caa..e38ac9a0ad2da 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -534,6 +534,8 @@ template __global__ void masked_multihead_attention_kernel( Masked_multihead_attention_params params) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + static_assert(Dh % THREADS_PER_KEY == 0, ""); static_assert(Dh % THREADS_PER_VALUE == 0, ""); @@ -821,6 +823,9 @@ __global__ void masked_multihead_attention_kernel( printf("\n"); } #endif +#else + assert(false); +#endif } template From 62c0304b21f14b6c85e5f2c8439cc2e87f25e785 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Thu, 28 Apr 2022 10:32:39 +0800 Subject: [PATCH 02/10] [CustomDevice]change import way of unpublished file in op_test test=allcases (#42285) * test op_test test=allcases * fix * avoid copy many same file * fix for win * test PYTHONPATH * change path adding way * fix win * use old way * use old way test=allcase * use old way test=allcase --- paddle/scripts/paddle_build.sh | 4 ++++ python/paddle/fluid/tests/unittests/op_test.py | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9c5eef6292581..5f0a70dc0e69f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -752,6 +752,8 @@ function run_linux_cpu_test() { pip install hypothesis pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python + cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python ut_total_startTime_s=`date +%s` if [ ${WITH_TESTING:-ON} == "ON" ] ; then cat < Date: Thu, 28 Apr 2022 10:54:40 +0800 Subject: [PATCH 03/10] [KP] fix bug when phi kernel is *_raw (#42113) * [KP] fix bug when phi kernel is *_raw * modify the static graph * delete useless comment * delete the phi multiply kernel case * add VLOG(3) message * add VLOG(3) message * fix static graph error in phi * fix bug in tranform model * modify the comment * delete useless code * fix CI bug * fix CI bug --- paddle/fluid/framework/operator.cc | 100 ++++++++++++++----- paddle/fluid/imperative/prepared_operator.cc | 20 ++-- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e17a5d55f1f0a..0c22321996b8f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -1281,6 +1278,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); } +// TODO(Liu-xiandong): Now we are using too much if-else and hard code in XPU +// device, it's ugly, and we will refactor in the future. +#if defined(PADDLE_WITH_XPU_KP) + bool use_phi_xpu_kp = false; +#endif + // TODO(chenweihang): Now we are still reusing a lot of the original fluid // implementation, this is a gradual replacement process // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA @@ -1299,6 +1302,45 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); pt_kernel_name = kernel_signature_->name; +// NOTE(Liu-xiandong): The register kernel used KP have library_type[KP], +// But the default library_type is Plain, so we need to modify the +// library_type here, otherwise it can't work. +#ifdef PADDLE_WITH_XPU_KP + if (paddle::platform::is_xpu_place(kernel_type_->place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "phi xpu_kp using rt mode in static graph"; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "phi xpu_kp using debug mode in static graph"; + } + bool is_xpu_kp_support = + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + auto expected_kernel_key_library_type = kernel_type_->library_type_; + kernel_type_->library_type_ = LibraryType::kKP; + VLOG(3) << "modifing XPU KP kernel in static graph: " + << pt_kernel_name + << ", using_kernel_key:" << *kernel_type_.get(); + auto try_pt_kernel_key = + TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); + if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, + try_pt_kernel_key)) { + kernel_type_->library_type_ = expected_kernel_key_library_type; + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " is failed " << *kernel_type_.get(); + } else { + use_phi_xpu_kp = true; + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " is succeed " << *kernel_type_.get(); + } + } + } +#endif pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( @@ -1314,9 +1356,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } else { pt_kernel_name = kernel_signature_->name; -// NOTE(Liu-xiandong): The register kernel used KP have library_type[KP], -// But the default library_type is Plain, so we need to modify the -// library_type here, otherwise it can't work. +// NOTE(Liu-xiandong):In my ctest, this branch do not be executed, +// I can't understand it, it's really confusing. +// But we still need to keep this to avoid errors. 
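// In short, the #ifdef block below repeats the XPU KP fallback used in the
// phi-kernel branch above: when the op is on an XPU place and either
// FLAGS_run_kp_kernel reports runtime KP support for it or it appears in the
// KP white list, the kernel key is retried with LibraryType::kKP; if no phi
// kernel is registered under that key, the original library type is restored
// and the normal lookup continues, otherwise use_phi_xpu_kp is set.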
#ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(kernel_type_->place_)) { bool use_xpu_kp_kernel_rt = @@ -1335,15 +1377,20 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (is_xpu_kp_support) { auto expected_kernel_key_library_type = kernel_type_->library_type_; kernel_type_->library_type_ = LibraryType::kKP; - VLOG(3) << "modifing XPU KP kernel in static graph: " << type_ + VLOG(3) << "modifing XPU KP kernel in static graph: " + << pt_kernel_name << ", using_kernel_key:" << *kernel_type_.get(); auto try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, try_pt_kernel_key)) { kernel_type_->library_type_ = expected_kernel_key_library_type; - VLOG(3) << "modify XPU KP kernel in static graph: " << type_ - << " is failed " << *kernel_type_.get(); + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " is failed " << *kernel_type_.get(); + } else { + use_phi_xpu_kp = true; + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " is succeed " << *kernel_type_.get(); } } } @@ -1360,11 +1407,25 @@ void OperatorWithKernel::RunImpl(const Scope& scope, !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || paddle::platform::is_in_xpu_black_list(type_); #endif +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + paddle::platform::is_xpu_place(kernel_type_->place_) && + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_xpu_place(kernel_type_->place_) && + paddle::platform::is_in_xpu_kpwhite_list(type_); + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); +#endif + if (pt_kernel_->IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + && (!is_xpu_unsupport || use_phi_xpu_kp) +#endif + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1374,15 +1435,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // we need to select the heterogeneous kernel in fluid, but the kernel // registered in KP use library_type[KP], we need to modify it. 
#ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - paddle::platform::is_xpu_place(kernel_type_->place_) && - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_xpu_place(kernel_type_->place_) && - paddle::platform::is_in_xpu_kpwhite_list(type_); - bool is_xpu_kp_support = - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { kernel_type_->library_type_ = LibraryType::kKP; } @@ -1609,7 +1661,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(type_))) { - VLOG(3) << "missing XPU kernel: " << type_ + VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -1625,10 +1677,10 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(type_); if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; + VLOG(3) << "fluid xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; + VLOG(3) << "fluid xpu_kp using debug mode "; } bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { @@ -1645,7 +1697,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } else { - VLOG(3) << "using XPU KP kernel: " << type_ + VLOG(3) << "fluid using XPU KP kernel: " << type_ << ", using_kernel_key:" << expected_kernel_key; } } @@ -1654,7 +1706,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { paddle::platform::is_in_xpu_black_list(type_)); if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << type_ + VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index bf69f6cf5ac9d..38180ba963c38 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -233,14 +233,18 @@ PreparedOp PrepareImpl( auto expected_kernel_key_library_type = expected_kernel_key.library_type_; expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - VLOG(3) << "modifing XPU KP kernel: " << op.Type() + VLOG(3) << "modifing XPU KP kernel: " << pt_kernel_name << ", using_kernel_key:" << expected_kernel_key; + phi::KernelKey try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); if (!phi_kernel_factory.HasKernel(pt_kernel_name, try_pt_kernel_key)) { expected_kernel_key.library_type_ = expected_kernel_key_library_type; - VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed " - << expected_kernel_key; + VLOG(3) << "modify XPU KP kernel: " << pt_kernel_name + << " in dynamic graph is failed " << expected_kernel_key; + } else { + VLOG(3) << "modify XPU KP kernel: " << pt_kernel_name + << " in dynamic graph is succeed " << expected_kernel_key; } } } @@ -332,7 +336,7 @@ PreparedOp PrepareImpl( #if 
defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << op.Type() + VLOG(3) << "fluid missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -343,20 +347,20 @@ PreparedOp PrepareImpl( #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; + VLOG(3) << "fluid xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; + VLOG(3) << "fluid xpu_kp using debug mode "; } if (is_xpu_kp_support) { expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() + VLOG(3) << "using fluid XPU KP kernel: " << op.Type() << ", using_kernel_key:" << expected_kernel_key; } if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << op.Type() + VLOG(3) << "fluid missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); From c7a258fe8c1c9763d485069abbe4ba546a9cb994 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:56:21 +0800 Subject: [PATCH 04/10] fix PIL sample mode deprecated warning (#42307) * fix PIL sample mode deprecated warning * compatible with old pil version --- .../vision/transforms/functional_pil.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index b3ff37d7ea3bb..32f65fa1f846f 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -32,14 +32,25 @@ Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable -_pil_interp_from_str = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING -} +try: + # PIL version >= "9.1.0" + _pil_interp_from_str = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } +except: + _pil_interp_from_str = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } __all__ = [] From acbb5dbee8ce170bcc3c12e6819206f063438af5 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Thu, 28 Apr 2022 11:54:39 +0800 Subject: [PATCH 05/10] [CustomDevice] add amp support (#42035) --- paddle/fluid/imperative/amp_auto_cast.cc | 1 + python/paddle/fluid/dygraph/amp/auto_cast.py | 9 +++++++-- python/paddle/fluid/dygraph/amp/loss_scaler.py | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 7d60b7d26f3fb..3f6863d642cc8 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ 
b/paddle/fluid/imperative/amp_auto_cast.cc @@ -220,6 +220,7 @@ inline bool NeedCast(const std::shared_ptr& var) { paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || paddle::platform::is_mlu_place(place) || + paddle::platform::is_custom_place(place) || paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index f7d4be7ee6e3c..5da5dbbd7bdfc 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -276,9 +276,10 @@ def amp_guard(enable=True, if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() or - tracer._expected_place.is_npu_place()): + tracer._expected_place.is_npu_place() or + tracer._expected_place.is_custom_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False # For npu: @@ -293,6 +294,10 @@ def amp_guard(enable=True, if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): warnings.warn('MLUPlace only support float16 amp.') enable = False + # For custom device: + if tracer._expected_place.is_custom_place() and (dtype == 'bfloat16'): + warnings.warn('CustomPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index c57290861942b..df79b5ab5e482 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -107,9 +107,10 @@ def __init__(self, if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() or - tracer._expected_place.is_npu_place()): + tracer._expected_place.is_npu_place() or + tracer._expected_place.is_custom_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.' 
% tracer._expected_place) enable = False From 108aeb28704e64a54f82b8a59266a4e9633f9949 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 28 Apr 2022 12:02:23 +0800 Subject: [PATCH 06/10] Add gradient merge for DistributedFusedLamb optimizer (#40177) * add gradient merge for DistributedFusedLamb * use master acc gradient * fix CI ut * polish * remove math_function_impl.h change * fix test_update_loss_scaling_op.py * try to fix XPU/NPU CI * add gm ut --- .../operators/amp/update_loss_scaling_op.cc | 24 ++- .../operators/amp/update_loss_scaling_op.cu | 24 ++- .../operators/amp/update_loss_scaling_op.h | 60 +++++- .../amp/update_loss_scaling_op_npu.cc | 5 +- .../optimizers/distributed_fused_lamb_op.cc | 10 + .../optimizers/distributed_fused_lamb_op.cu | 181 +++++++++++++++++- .../fluid/contrib/mixed_precision/amp_nn.py | 6 +- .../contrib/mixed_precision/decorator.py | 2 +- .../fluid/tests/unittests/CMakeLists.txt | 2 + .../distributed_fused_lamb_test_base.py | 18 +- ...est_distributed_fused_lamb_op_with_clip.py | 5 +- ...buted_fused_lamb_op_with_gradient_merge.py | 28 +++ .../optimizer/distributed_fused_lamb.py | 35 ++++ 13 files changed, 369 insertions(+), 31 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index b974f606720b2..8354650df0237 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -68,6 +68,18 @@ class UpdateLossScalingOp : public framework::OperatorWithKernel { return framework::OpKernelType(dtype, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { +#ifndef PADDLE_WITH_XPU + if (var_name == "FoundInfinite" || var_name == "StopUpdate") { + return expected_kernel_type; + } +#endif + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } }; class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { @@ -93,6 +105,10 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling."); AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps."); AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps."); + AddOutput("StopUpdate", + "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " + "zero inputs. It has higher priority than Attr(stop_update).") + .AsDispensable(); AddAttr("incr_every_n_steps", "A value represents increasing loss scaling every n " "consecutive steps with finite gradients."); @@ -131,8 +147,8 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite. 
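For example, with illustrative attribute values incr_every_n_steps = 1000,
incr_ratio = 2.0, decr_every_n_nan_or_inf = 2 and decr_ratio = 0.5, the loss
scaling is doubled once 1000 consecutive steps have produced only finite
gradients, and halved once 2 consecutive steps have produced NaN/Inf
gradients; the good-step and bad-step counters are reset to zero after each
adjustment (see the Update() helper in update_loss_scaling_op.h).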
} }; -template -class UpdateLossScalingFunctor { +template +class UpdateLossScalingFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, @@ -141,6 +157,10 @@ class UpdateLossScalingFunctor { const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) const { + PADDLE_ENFORCE_EQ( + IsFoundInfOnCPU, true, + platform::errors::InvalidArgument( + "The Input(FoundInfinite) should be on the CPUPlace.")); Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling_data, good_out_data, diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index 6d9cd96a3fb9a..43f8f84578c70 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -21,9 +21,9 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template __global__ void GpuUpdateLossScaling( - const bool* found_inf_data, const T* pre_loss_scaling_data, + const FoundNanInfFlagT found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, const int* bad_in_data, const int incr_every_n_steps, const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, @@ -70,8 +70,9 @@ __global__ void FusedFillIf(T** outs, const size_t xs_size, } } -template -class UpdateLossScalingFunctor { +template +class UpdateLossScalingFunctor { public: void operator()(const platform::CUDADeviceContext& dev_ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, @@ -80,10 +81,17 @@ class UpdateLossScalingFunctor { const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) const { - GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( - found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, - incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, - updated_loss_scaling_data, good_out_data, bad_out_data); + if (IsFoundInfOnCPU) { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + *found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } else { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index d6eddd36a4551..41eb94247f593 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -40,8 +41,16 @@ inline HOSTDEVICE bool check_finite(T value) { #endif } -template -inline HOSTDEVICE void Update(const bool* found_inf_data, +inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { + return 
found_nan_inf_data; +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { + return *found_nan_inf_data; +} + +template +inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, const int* bad_in_data, const int incr_every_n_steps, @@ -49,7 +58,7 @@ inline HOSTDEVICE void Update(const bool* found_inf_data, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) { - if (*found_inf_data) { + if (IsFoundNanInf(found_inf_data)) { *good_out_data = 0; *bad_out_data = *bad_in_data + 1; if (*bad_out_data == decr_every_n_nan_or_inf) { @@ -72,7 +81,7 @@ inline HOSTDEVICE void Update(const bool* found_inf_data, } } -template +template class UpdateLossScalingFunctor { public: void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, @@ -106,9 +115,33 @@ class UpdateLossScalingKernel : public framework::OpKernel { platform::errors::InvalidArgument( "FoundInfinite must has only one element.")); const bool* found_inf_data = found_inf->data(); + bool is_found_inf_on_cpu = platform::is_cpu_place(found_inf->place()); + + if (is_found_inf_on_cpu) { + if (*found_inf_data) { + phi::funcs::SetConstant set_constant; + for (auto* out : outs) { + out->mutable_data(dev_ctx.GetPlace()); + set_constant(dev_ctx, out, static_cast(0)); + } + } + } else { + LazyZeros{}(dev_ctx, found_inf_data, xs, outs); + } - LazyZeros{}(dev_ctx, found_inf_data, xs, outs); - const bool stop_update = ctx.Attr("stop_update"); + const auto* stop_update_tensor = ctx.Input("StopUpdate"); + bool stop_update = false; + if (stop_update_tensor && stop_update_tensor->IsInitialized()) { + if (platform::is_cpu_place(stop_update_tensor->place())) { + stop_update = stop_update_tensor->data()[0]; + } else { + framework::Tensor tmp_tensor; + framework::TensorCopySync(*stop_update_tensor, platform::CPUPlace(), + &tmp_tensor); + stop_update = tmp_tensor.data()[0]; + } + } + stop_update |= ctx.Attr("stop_update"); if (stop_update) { return; } @@ -133,10 +166,17 @@ class UpdateLossScalingKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( - dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, - bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, - decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + if (is_found_inf_on_cpu) { + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + } else { + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 1393da7dd57a7..5808841333f08 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -131,7 +131,8 @@ void Update(const platform::NPUDeviceContext& ctx, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: void operator()(const 
platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, @@ -236,7 +237,7 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( + UpdateLossScalingFunctor{}( dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling, good_out, bad_out); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 161483c3420fc..0159e250d317e 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -100,6 +100,10 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddOutput("FP16FusedParamOut", "The updated FP16FusedParam.") .AsDispensable(); + AddOutput("FP32AccFusedGrad", "The accumulated FP32 gradients.") + .AsDispensable(); + AddOutput("FP16AccFusedGrad", "The accumulated FP16 gradients.") + .AsDispensable(); AddOutput("Moment1Out", "The updated Moment1."); AddOutput("Moment2Out", "The updated Moment2."); @@ -110,8 +114,14 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput("FoundInf", "Whether there is NaN/Inf"); + AddOutput("AccStep", "The training steps.").AsDispensable(); + AddOutput("StopUpdate", + "Whether the parameter updating is stopped when the gradient " + "accumulated steps is less than Attr(acc_steps).") + .AsDispensable(); AddOutput("Step", "The global step which excludes the NaN/Inf step."); + AddAttr("acc_steps", "The gradient accumulation steps.").SetDefault(1); AddAttr("beta1", "The initial Beta1Pow value."); AddAttr("beta2", "The initial Beta2Pow value."); AddAttr("epsilon", diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f445a140f27a3..c857c6de4d093 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -1041,6 +1041,58 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, } } +template +static __global__ void ElementwiseAddWithCastCUDAKernel(const T1 *x, + const T2 *y, T3 *z, + int n) { + static_assert(sizeof(T1) <= sizeof(T2), + "sizeof(T1) must be smaller than sizeof(T2)."); + using MT = MasterT; + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = (blockDim.x * gridDim.x) * VecSize; + for (; i + VecSize <= n; i += stride) { + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; + phi::AlignedVector z_vec; + phi::Load(x + i, &x_vec); + phi::Load(y + i, &y_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + auto x_tmp = static_cast(x_vec[j]); + auto y_tmp = static_cast(y_vec[j]); + z_vec[j] = static_cast(x_tmp + y_tmp); + } + phi::Store(z_vec, z + i); + } + + for (; i < n; ++i) { + auto x_tmp = static_cast(x[i]); + auto y_tmp = static_cast(y[i]); + z[i] = static_cast(x_tmp + y_tmp); + } +} + +template +static void LaunchElementwiseAddWithCastKernel( + const platform::CUDADeviceContext &dev_ctx, const T1 *x, const T2 *y, T3 *z, + int n, gpuStream_t stream) { + int vec_size = + std::min(std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)), + GetChunkedVecSize(z, 0)); + auto config 
= platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL \ + do { \ + ElementwiseAddWithCastCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y, z, \ + n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL); +#undef PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL +} + template class DistributedFusedLambOpKernel : public framework::OpKernel { @@ -1051,6 +1103,9 @@ class DistributedFusedLambOpKernel auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); + auto *found_inf_t = ctx.Output("FoundInf"); + found_inf_t->Resize({1}); + // Step 1: Get fp16 param and grad tensors int64_t fp16_numel; auto *fp16_param = GetSameInOutTensorPtr( @@ -1095,6 +1150,128 @@ class DistributedFusedLambOpKernel "Too many parameter number. Only <= %d is supported.", std::numeric_limits::max())); + auto acc_steps = ctx.Attr("acc_steps"); + PADDLE_ENFORCE_GE( + acc_steps, 1, + platform::errors::InvalidArgument( + "The gradient accumulation steps should be not less than 1.")); + if (acc_steps > 1) { + auto *step_t = ctx.Output("AccStep"); + PADDLE_ENFORCE_NOT_NULL( + step_t, + platform::errors::InvalidArgument( + "Output(AccStep) cannot be nullptr when Attr(acc_steps) > 1.")); + bool is_initialized = step_t->IsInitialized(); + int64_t *step_ptr; + if (is_initialized) { + step_ptr = step_t->mutable_data(platform::CPUPlace()); + ++(*step_ptr); + } else { + step_t->Resize({1}); + step_ptr = step_t->mutable_data(platform::CPUPlace()); + *step_ptr = 1; + } + int64_t rounded_step = (*step_ptr) % acc_steps; + + float *fp32_acc_grad = nullptr; + if (has_fp32_param) { + auto *fp32_acc_grad_t = + ctx.Output("FP32AccFusedGrad"); + PADDLE_ENFORCE_NOT_NULL( + fp32_acc_grad_t, platform::errors::InvalidArgument( + "Output(FP32AccFusedGrad) cannot be nullptr " + "when Attr(acc_steps) > 1.")); + if (!fp32_acc_grad_t->IsInitialized()) { + fp32_acc_grad_t->Resize({static_cast(fp32_numel)}); + fp32_acc_grad = fp32_acc_grad_t->mutable_data(place); + } else { + fp32_acc_grad = fp32_acc_grad_t->data(); + } + } + + platform::float16 *fp16_acc_grad = nullptr; + float *master_acc_grad = nullptr; + if (has_fp16_param) { + auto *fp16_acc_grad_t = + ctx.Output("FP16AccFusedGrad"); + PADDLE_ENFORCE_NOT_NULL( + fp16_acc_grad_t, platform::errors::InvalidArgument( + "Output(FP16AccFusedGrad) cannot be nullptr " + "when Attr(acc_steps) > 1.")); + if (!fp16_acc_grad_t->IsInitialized()) { + fp16_acc_grad_t->Resize({static_cast(3 * fp16_numel)}); + fp16_acc_grad = + fp16_acc_grad_t->mutable_data(place); + } else { + fp16_acc_grad = fp16_acc_grad_t->data(); + } + master_acc_grad = reinterpret_cast(fp16_acc_grad + fp16_numel); + } + + // Inplace addto + if (has_fp32_param) { + if (rounded_step == 1) { + memory::Copy(place, fp32_acc_grad, place, fp32_grad, + fp32_numel * sizeof(float), stream); + } else { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp32_grad, fp32_acc_grad, + fp32_acc_grad, fp32_numel, stream); + } + } + + if (has_fp16_param) { + if (acc_steps == 2) { + if (rounded_step == 0) { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad, + fp16_grad, fp16_acc_grad, + fp16_numel, stream); + } else { + memory::Copy(place, fp16_acc_grad, place, fp16_grad, + fp16_numel * sizeof(platform::float16), stream); + } + } else { // acc_steps >= 3 + if (rounded_step == 0) { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad, + master_acc_grad, fp16_acc_grad, + fp16_numel, stream); + } else if (rounded_step == 1) { 
+ memory::Copy(place, fp16_acc_grad, place, fp16_grad, + fp16_numel * sizeof(platform::float16), stream); + } else if (rounded_step == 2) { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad, + fp16_acc_grad, master_acc_grad, + fp16_numel, stream); + } else { + LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad, + master_acc_grad, master_acc_grad, + fp16_numel, stream); + } + } + } + + auto *stop_update_t = ctx.Output("StopUpdate"); + stop_update_t->Resize({1}); + auto *stop_update = + stop_update_t->mutable_data(platform::CPUPlace()); + + auto *found_inf_cpu = + found_inf_t->mutable_data(platform::CPUPlace()); + + if (rounded_step != 0) { + *stop_update = true; + auto *found_inf_cpu = + found_inf_t->mutable_data(platform::CPUPlace()); + *found_inf_cpu = false; + return; + } else { + // swap pointer + fp32_grad = fp32_acc_grad; + fp16_grad = fp16_acc_grad; + *stop_update = false; + found_inf_t->clear(); + } + } + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; @@ -1122,7 +1299,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // GlobalScale, FoundInf + // GlobalScale const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1157,8 +1334,6 @@ class DistributedFusedLambOpKernel auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - auto *found_inf_t = ctx.Output("FoundInf"); - found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py index 588eb2a29f555..c5b9b9e71f6be 100644 --- a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -129,9 +129,13 @@ def update_loss_scaling(x, 'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf, 'incr_ratio': incr_ratio, 'decr_ratio': decr_ratio, - 'stop_update': stop_update } + if isinstance(stop_update, Variable): + inputs['StopUpdate'] = stop_update + else: + attrs['stop_update'] = stop_update + helper.append_op( type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index c6e2bcb8b1a24..c3720396e1d77 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -432,7 +432,7 @@ def _add_dynamic_loss_scaling(self, params_grads, found_inf): self._decr_every_n_nan_or_inf, self._incr_ratio, self._decr_ratio, - stop_update=False, + stop_update=self._optimizer._get_stop_update_var(), name="update_loss_scaling") return diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 12ed7b975af0c..15dd3d8b8f509 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -914,6 +914,7 @@ set_tests_properties(test_parallel_executor_crf test_sync_batch_norm_op test_inp test_parallel_executor_seresnext_with_fuse_all_reduce_gpu test_distributed_fused_lamb_op_with_clip test_distributed_fused_lamb_op_without_clip + 
test_distributed_fused_lamb_op_with_gradient_merge test_parallel_executor_fetch_isolated_var PROPERTIES LABELS "RUN_TYPE=DIST") @@ -1047,6 +1048,7 @@ set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) +set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index 00d2a1f71d6bd..0af7d40a2f02e 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -149,6 +149,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): kwargs['exclude_from_weight_decay_fn'] = exclude_fn kwargs['lamb_weight_decay'] = 0.1 + gm_steps = kwargs['gradient_accumulation_steps'] if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) @@ -163,6 +164,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): ) kwargs['grad_clip'] = GradClipDecorator(base_clip, clip_after_allreduce) + kwargs.pop('gradient_accumulation_steps', None) optimizer = optimizer_class(**kwargs) get_parameter = optimizer._get_parameter @@ -173,6 +175,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): if use_fp16: if not use_distributed_lamb: optimizer._multi_precision = True + optimizer = paddle.static.amp.decorate( optimizer, amp_list, @@ -180,6 +183,13 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): use_dynamic_loss_scaling=False, use_pure_fp16=use_fp16, use_fp16_guard=use_fp16) + amp_init = optimizer.amp_init + else: + amp_init = None + + if gm_steps > 1 and not use_distributed_lamb: + optimizer = paddle.fluid.optimizer.GradientMergeOptimizer( + optimizer, k_steps=gm_steps, avg=False) params_grads = optimizer.backward(loss, startup) op_num = len(main.global_block().ops) @@ -211,7 +221,7 @@ def gen_random_grad_tensor(grad): return grad_t def reader(): - for _ in range(5): + for _ in range(6): yield dict( [(grad.name, gen_random_grad_tensor(grad)) for grad in grads]) @@ -223,8 +233,8 @@ def reader(): place = paddle.CUDAPlace(dev_id) exe = paddle.static.Executor(place) exe.run(startup) - if use_fp16: - optimizer.amp_init(place) + if amp_init is not None: + amp_init(place) master_p_ts = [] for p in params: @@ -258,10 +268,12 @@ def config(self): distutils.util.strtobool( os.getenv('CLIP_AFTER_ALLREDUCE', 'True'))) max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0)) + gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1)) print('clip_after_allreduce = {}, max_global_norm = {}'.format( clip_after_allreduce, max_global_norm)) return { 'clip_after_allreduce': clip_after_allreduce, + 'gradient_accumulation_steps': gm_steps, 'grad_clip': paddle.nn.ClipGradByGlobalNorm(max_global_norm) if max_global_norm > 0 else None, } diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py 
b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py index af99529adfa74..315580dd31ad7 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py @@ -34,7 +34,9 @@ def remove_file_if_exists(file_name): shutil.rmtree(file_name) -def run_test(clip_after_allreduce=True, max_global_norm=-1.0): +def run_test(clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=1): if not paddle.is_compiled_with_cuda(): return if os.name == 'nt': @@ -55,6 +57,7 @@ def run_test(clip_after_allreduce=True, max_global_norm=-1.0): os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce) os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) + os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) touch_file_env = 'SUCCESS_TOUCH_FILE' touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid()) diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py new file mode 100644 index 0000000000000..1822b77d0d0e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from test_distributed_fused_lamb_op_with_clip import run_test +import unittest + + +class TestDistributedFusedLambGradientMerge(unittest.TestCase): + def test_gm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 74b5398230dee..4d40a477ffc07 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -38,6 +38,7 @@ def __init__(self, is_grad_scaled_by_nranks=True, alignment=128, use_master_param_norm=True, + gradient_accumulation_steps=1, name=None): assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" @@ -63,6 +64,9 @@ def __init__(self, self._scale = None self._ring_id = 0 self._use_master_param_norm = use_master_param_norm + self._gradient_accumulation_steps = gradient_accumulation_steps + assert self._gradient_accumulation_steps >= 1 + self.helper = LayerHelper('distributed_fused_lamb') self._supports_check_nan_inf = True # very import flag for AMP @@ -73,8 +77,19 @@ def __init__(self, dtype=core.VarDesc.VarType.BOOL) self._step = None + if self._gradient_accumulation_steps > 1: + self._stop_update = main_block.create_var( + name=unique_name.generate('stop_update'), + shape=[1], + dtype=core.VarDesc.VarType.BOOL) + else: + self._stop_update = None + self._param_to_master_param = {} + def _get_stop_update_var(self): + return self._stop_update if self._stop_update is not None else False + def _set_step(self, step): self._step = step @@ -194,6 +209,20 @@ def _apply_gradients_impl(self, params_grads): param_order = self._create_persistable_var('param_order', dtype='int32') param_order.is_distributed = True + if self._gradient_accumulation_steps > 1: + fp32_acc_fused_grad = [ + self._create_persistable_var('fp32_acc_fused_grad') + ] + fp16_acc_fused_grad = [ + self._create_persistable_var( + 'fp16_acc_fused_grad', dtype='float16') + ] + acc_step = [self._create_persistable_var('acc_step', dtype='int64')] + else: + fp32_acc_fused_grad = [] + fp16_acc_fused_grad = [] + acc_step = [] + step = self._get_or_create_step() rank = get_rank() @@ -298,6 +327,11 @@ def _apply_gradients_impl(self, params_grads): 'ParamOut': params, 'GradOut': grads, 'FoundInf': [self._found_inf], + 'FP32AccFusedGrad': fp32_acc_fused_grad, + 'FP16AccFusedGrad': fp16_acc_fused_grad, + 'AccStep': acc_step, + 'StopUpdate': self._stop_update + if self._stop_update is not None else [], 'Step': [step], }, attrs={ @@ -311,5 +345,6 @@ def _apply_gradients_impl(self, params_grads): 'ring_id': self._ring_id, 'use_master_param_norm': self._use_master_param_norm, 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks, + 'acc_steps': self._gradient_accumulation_steps, }) return [lamb_op] From afa846d9f8fe620942f8ac15ea43e5fb6052cbaf Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 28 Apr 2022 12:59:42 +0800 Subject: [PATCH 07/10] fix error report. 
(#42333) --- paddle/fluid/inference/api/analysis_predictor.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 015f4471a0246..4f0d4a908380f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -48,6 +48,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/common/place.h" #include "paddle/utils/string/split.h" #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) @@ -1641,7 +1642,9 @@ AnalysisPredictor::~AnalysisPredictor() { StatisticShapeRangeInfo(); } - memory::Release(place_); + if (place_.GetType() != phi::AllocationType::UNDEFINED) { + memory::Release(place_); + } } std::unique_ptr AnalysisPredictor::Clone() { From 8ad38701f2d1726f376b0f1cdff9bb481b993dba Mon Sep 17 00:00:00 2001 From: Tomasz Socha Date: Thu, 28 Apr 2022 09:11:12 +0200 Subject: [PATCH 08/10] Bfloat16 refactor (#42238) * Refactor Quantization * Refactor Dequantization * Classy solution * Style I * Style II * Style III * Use VLOG(4) for debug info * Style IV --- .../framework/ir/graph_pattern_detector.cc | 37 +- .../framework/ir/graph_pattern_detector.h | 33 +- .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 448 ++++++++---------- .../framework/ir/mkldnn/cpu_bfloat16_pass.h | 2 - .../ir/mkldnn/cpu_bfloat16_placement_pass.cc | 45 +- .../ir/mkldnn/cpu_bfloat16_placement_pass.h | 11 +- 6 files changed, 228 insertions(+), 348 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 8eb1b64a2763a..fbd8fda131b6d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2665,41 +2665,8 @@ PDNode *patterns::UnsupportedBfloat16::operator()() { return op; } -PDNode *patterns::LastBfloat16Ops::operator()() { - auto *op = pattern->NewNode(op_repr())->assert_is_op(); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - auto *op_out = pattern->NewNode(op_out_repr())->AsOutput(); - op->LinksTo({op_out}); - return op_out; -} - -PDNode *patterns::FirstBfloat16Ops::operator()() { - auto *op_in = pattern->NewNode(op_in_repr())->AsInput(); - - auto *op = pattern->NewNode(op_repr())->assert_is_op(); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - - op->LinksFrom({op_in}); - return op; -} - -PDNode *patterns::DuplicatedInputs::operator()() { - auto op = pattern->NewNode(op_repr())->assert_is_ops({"concat", "sum"}); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - return op; -} - -PDNode *patterns::DuplicatedOutputs::operator()() { - auto op = pattern->NewNode(op_repr())->assert_is_ops({"split"}); +PDNode *patterns::Bloat16Ops::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_op(); op->assert_more([&](Node *node) { return node->Op()->GetAttrIfExists("mkldnn_data_type") == "bfloat16"; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 434ede6cf7a3b..d7e265fe28bf9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1565,36 +1565,9 @@ 
struct UnsupportedBfloat16 : public PatternBase { PATTERN_DECL_NODE(op); }; -struct LastBfloat16Ops : public PatternBase { - LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "last_bfloat16_ops") {} - PDNode* operator()(); - - PATTERN_DECL_NODE(op); - PATTERN_DECL_NODE(op_out); -}; - -struct FirstBfloat16Ops : public PatternBase { - FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "first_bfloat16_ops") {} - PDNode* operator()(); - - PATTERN_DECL_NODE(op_in); - PATTERN_DECL_NODE(op); -}; - -struct DuplicatedInputs : public PatternBase { - DuplicatedInputs(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "many_inputs_op") {} - - PDNode* operator()(); - - PATTERN_DECL_NODE(op); -}; - -struct DuplicatedOutputs : public PatternBase { - DuplicatedOutputs(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "many_outputs_op") {} +struct Bloat16Ops : public PatternBase { + Bloat16Ops(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "many_bfloat16_ops") {} PDNode* operator()(); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index f1bd34a5ad4f6..62b2be712beef 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -22,290 +22,226 @@ namespace paddle { namespace framework { namespace ir { -using string::PrettyLogDetail; +namespace { +class Quanter { + public: + void AddQuantOps() { + if (IsNotPermittedOpType()) return; -void UnlinkNodes(ir::Node* a, ir::Node* b) { - a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), - a->outputs.end()); - b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), - b->inputs.end()); -} + std::vector linked_xputs; -// Checking whether a reorder from FP32 to BF16 should be added before the input -// to the operator -bool IsPermittedInputName(const std::string& input_name) { - // Only the inputs listed in \"permitted_names\" requires quanitization before - // the bfloat16 operator. Other inputs, such as Filter and Bias are reordered - // in the kernel. - const std::vector permitted_names = {"X", "Y", "Input", - "ResidualData"}; - return (std::find(permitted_names.begin(), permitted_names.end(), - input_name) != permitted_names.end()); -} + for (const auto& logical_xput : op_xputs) { + std::vector quant_xput_names; + quant_xput_names.reserve(xputs_map.size()); -// Checking whether a reorder from BF16 to FP32 should be added after the output -// to the operator -bool IsPermittedOutputName(const std::string& output_name) { - // XShape is output in transpose2 and reshape2 operators used to store the - // shape and lod of X. So this output do not need dequantize before. 
- return (output_name != "XShape"); -} + const auto& logical_xput_name = logical_xput.first; + if (IsNotPermittedName(logical_xput_name)) continue; -void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, - int& quantize_counter) { - std::vector input_names; - - // Find the name of the input linking op to op_in - for (auto name : op->Op()->InputNames()) - for (auto input_name : op->Op()->Input(name)) - if (input_name == op_in->Name() && IsPermittedInputName(name)) - input_names.push_back(name); - - if (input_names.empty()) return; - - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); - - OpDesc q_desc; - q_desc.SetType("quantize"); - q_desc.SetInput("Input", std::vector({op_in->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node->Name()})); - q_desc.SetAttr("Scale", 1.f); - q_desc.SetAttr("Shift", 0.0f); - q_desc.SetAttr("bfloat16", true); - q_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. - - for (auto name = input_names.begin(); name < input_names.end(); name++) - op->Op()->SetInput(*name, - std::vector({quantize_out_node->Name()})); - - UnlinkNodes(op_in, op); - IR_NODE_LINK_TO(op_in, quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_node); - IR_NODE_LINK_TO(quantize_out_node, op); - quantize_counter++; -} + const auto& physical_xputs_names = logical_xput.second; + for (const auto& physical_xput_name : physical_xputs_names) { + if (IsAlreadyLinked(linked_xputs, physical_xput_name)) continue; -void AddQuantizes(Graph* g, ir::Node* op, int& quantize_counter) { - auto inputs = op->inputs; - PADDLE_ENFORCE_GE(inputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s inputs(%d) must be equal or greater than 1.", - op->Name(), inputs.size())); - PADDLE_ENFORCE_EQ(op->outputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s outputs(%d) must be equal to 1.", op->Name(), - op->outputs.size())); - - OpDesc q_desc; - q_desc.SetType("quantize"); - - std::vector quantize_out_nodes(inputs.size()); - std::vector quantize_out_node_names(inputs.size()); - - for (size_t i = 0; i < inputs.size(); i++) { - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - quantize_out_nodes[i] = g->CreateVarNode(&quantize_out_desc); - quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); - - q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node_names[i]})); - q_desc.SetAttr("Scale", 1.f); - q_desc.SetAttr("Shift", 0.0f); - q_desc.SetAttr("bfloat16", true); - q_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
- - UnlinkNodes(inputs[i], op); - IR_NODE_LINK_TO(inputs[i], quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); - IR_NODE_LINK_TO(quantize_out_nodes[i], op); - quantize_counter++; + VarDesc quant_x_desc( + patterns::PDNodeName(get_op_type(), get_op_edge())); + auto quant_x_node = graph.CreateVarNode(&quant_x_desc); + const auto xput_name = quant_x_node->Name(); + quant_xput_names.emplace_back(xput_name); + + auto quant_op = create_quant_op(physical_xput_name, xput_name); + + auto physical_xput_node = xputs_map[physical_xput_name]; + link_nodes(physical_xput_node, quant_op, quant_x_node); + counter++; + linked_xputs.push_back(physical_xput_name); + } + + set_edge(logical_xput_name, quant_xput_names); + } } - op->Op()->SetInput("X", quantize_out_node_names); -} + int get_counter() const { return counter; } -// Operators like Concat and Sum have a single input name X, which actually -// consists of multiple inputs. Such operators require a different way to find -// pattern and add quantize ops. -void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int& quantize_counter) { - GraphPatternDetector gpd; - patterns::DuplicatedInputs duplicated_inputs{gpd.mutable_pattern(), - "duplicated_inputs"}; - duplicated_inputs(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_inputs); - AddQuantizes(g, op, quantize_counter); + virtual ~Quanter() = default; + + protected: + Graph& graph; + ir::Node* const op; + + std::map xputs_map; + const VariableNameMap& op_xputs; + + int counter = 0; + + Quanter(Graph& graph, ir::Node* const op, const VariableNameMap& op_xputs) + : graph(graph), op(op), op_xputs(op_xputs){}; + + virtual bool IsNotPermittedOpType() const = 0; + virtual bool IsNotPermittedName(const std::string& input_name) const = 0; + virtual std::string get_op_type() const = 0; + virtual std::string get_op_edge() const = 0; + virtual void link_nodes(ir::Node* const physical_xput_node, + ir::Node* const quant_op, + ir::Node* const quant_x_node) = 0; + virtual void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) = 0; + + bool IsAlreadyLinked(const std::vector& node_names, + const std::string& node_name) const { + return std::find(node_names.begin(), node_names.end(), node_name) != + node_names.end(); + } + + virtual ir::Node* create_quant_op(const std::string& input_name, + const std::string& output_name) const { + OpDesc op_desc; + op_desc.SetType(get_op_type()); + + op_desc.SetInput("Input", std::vector({input_name})); + op_desc.SetOutput("Output", std::vector({output_name})); + op_desc.SetAttr("Scale", 1.f); + op_desc.SetAttr("Shift", 0.0f); + op_desc.SetAttr("bfloat16", true); + op_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") + ? op->Op()->GetAttr("data_layout") + : std::string("NCHW")); + return graph.CreateOpNode(&op_desc); // OpDesc will be copied. 
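+    // Note: DeQuantizer's create_quant_op override (below) reuses this base
+    // implementation with the input and output names swapped.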
+ } + + void UnlinkNodes(ir::Node* a, ir::Node* b) const { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); + } +}; + +class Quantizer final : public Quanter { + public: + Quantizer(Graph* const graph, ir::Node* const op) + : Quanter(*graph, op, op->Op()->Inputs()) { + auto inputs = op->inputs; + PADDLE_ENFORCE_GE( + inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal or greater than 1.", op->Name(), + inputs.size())); + + for (auto input : inputs) xputs_map[input->Name()] = input; }; - gpd(graph, handler); -} -// Adding quantize ops before all operators except Concat and Sum, which have -// already been handled in AddReoderBeforeDuplicatedInputs -void AddReoderBeforeSingleInputs(ir::Graph* graph, int& quantize_counter) { - GraphPatternDetector gpd; - patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), - "first_bfloat16_ops"}; - bfloat16_ops(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops); - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); - if (op->Op()->Type() != "sum" && op->Op()->Type() != "concat") { - AddQuantize(g, op, op_in, quantize_counter); - } + protected: + bool IsNotPermittedOpType() const override { return false; } + + // Checking whether a reorder from FP32 to BF16 + // should be added before the input to the operator + bool IsNotPermittedName(const std::string& input_name) const override { + // Only the inputs listed in \"permitted_names\" + // requires quanitization before the bfloat16 operator. + // Other inputs, such as Filter and Bias are reordered in the kernel. + const std::vector permitted_names = {"X", "Y", "Input", + "ResidualData"}; + + return std::none_of( + permitted_names.begin(), permitted_names.end(), + [&input_name](const std::string& name) { return name == input_name; }); + } + + std::string get_op_type() const override { return "quantize"; }; + std::string get_op_edge() const override { return "out"; }; + + void link_nodes(ir::Node* const physical_xput_node, ir::Node* const quant_op, + ir::Node* const quant_x_node) override { + UnlinkNodes(physical_xput_node, op); + IR_NODE_LINK_TO(physical_xput_node, quant_op); + IR_NODE_LINK_TO(quant_op, quant_x_node); + IR_NODE_LINK_TO(quant_x_node, op); + } + + void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) override { + op->Op()->SetInput(logical_xput_name, quant_xput_names); + } +}; + +class DeQuantizer final : public Quanter { + public: + DeQuantizer(Graph* const graph, ir::Node* const op) + : Quanter(*graph, op, op->Op()->Outputs()) { + auto outputs = op->outputs; + PADDLE_ENFORCE_GE( + outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal or greater than 1.", op->Name(), + outputs.size())); + + for (auto output : outputs) xputs_map[output->Name()] = output; }; - gpd(graph, handler); -} -void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { - int quantize_counter = 0; - AddReoderBeforeDuplicatedInputs(graph, quantize_counter); - AddReoderBeforeSingleInputs(graph, quantize_counter); - PrettyLogDetail("--- added %d quantize ops before bfloat16 op", - quantize_counter); -} + protected: + bool IsNotPermittedOpType() const override { + // Prior_box operator output is always FP32 so no dequantization is needed. 
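+    // AddQuantOps() in the Quanter base class checks this predicate first and
+    // returns early, so no dequantize nodes are appended after such operators.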
+ return op->Op()->Type() == "prior_box"; + } -void AddDequantize(Graph* g, ir::Node* op, ir::Node* op_out, - int& dequantize_counter) { - if (op->Op()->Type() == "prior_box") return; - - // Find the name of the output linking op to op_out - std::vector output_names; - for (auto name : op->Op()->OutputNames()) - for (auto output_name : op->Op()->Output(name)) - if (output_name == op_out->Name() && IsPermittedOutputName(name)) - output_names.push_back(name); - - if (output_names.empty()) return; - - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - deq_desc.SetInput("Input", - std::vector({dequantize_in_node->Name()})); - deq_desc.SetOutput("Output", std::vector({op_out->Name()})); - deq_desc.SetAttr("Scale", 1.0f); - deq_desc.SetAttr("Shift", 0.0f); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. - - for (auto name = output_names.begin(); name < output_names.end(); name++) - op->Op()->SetOutput(*name, - std::vector({dequantize_in_node->Name()})); - - UnlinkNodes(op, op_out); - IR_NODE_LINK_TO(op, dequantize_in_node); - IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(dequantize_op, op_out); - - dequantize_counter++; -} + // Checking whether a reorder from BF16 to FP32 + // should be added after the output to the operator + bool IsNotPermittedName(const std::string& output_name) const override { + // XShape is output in transpose2 and reshape2 operators used to store the + // shape and lod of X. So this output do not need dequantize before. + return (output_name == "XShape"); + } + + std::string get_op_type() const override { return "dequantize"; }; + std::string get_op_edge() const override { return "in"; }; -void AddDequantizes(Graph* g, ir::Node* op, int& dequantize_counter) { - auto outputs = op->outputs; - PADDLE_ENFORCE_GE(outputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s outputs(%d) must be equal or greater than 1.", - op->Name(), outputs.size())); - PADDLE_ENFORCE_EQ(op->inputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s inputs(%d) must be equal to 1.", op->Name(), - op->inputs.size())); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - - std::vector dequantize_in_nodes(outputs.size()); - std::vector dequantize_in_node_names(outputs.size()); - - for (size_t i = 0; i < outputs.size(); i++) { - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - dequantize_in_nodes[i] = g->CreateVarNode(&dequantize_in_desc); - dequantize_in_node_names[i] = dequantize_in_nodes[i]->Name(); - - deq_desc.SetInput("Input", - std::vector({dequantize_in_node_names[i]})); - deq_desc.SetOutput("Output", - std::vector({outputs[i]->Name()})); - - deq_desc.SetAttr("Scale", 1.f); - deq_desc.SetAttr("Shift", 0.0f); - deq_desc.SetAttr("bfloat16", true); - deq_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
- - UnlinkNodes(op, outputs[i]); - IR_NODE_LINK_TO(op, dequantize_in_nodes[i]); - IR_NODE_LINK_TO(dequantize_in_nodes[i], dequantize_op); - IR_NODE_LINK_TO(dequantize_op, outputs[i]); - - dequantize_counter++; + void link_nodes(ir::Node* const physical_xput_node, ir::Node* const quant_op, + ir::Node* const quant_x_node) override { + UnlinkNodes(op, physical_xput_node); + IR_NODE_LINK_TO(quant_op, physical_xput_node); + IR_NODE_LINK_TO(quant_x_node, quant_op); + IR_NODE_LINK_TO(op, quant_x_node); } - op->Op()->SetOutput("Out", dequantize_in_node_names); -} + void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) override { + op->Op()->SetOutput(logical_xput_name, quant_xput_names); + } -// Operators like split have a single output name Out, which actually -// consists of multiple outputs. Such operators require a different way to find -// pattern and add dequantize ops. -void AddReoderAfterDuplicatedOutputs(ir::Graph* graph, - int& dequantize_counter) { - GraphPatternDetector gpd; - patterns::DuplicatedOutputs duplicated_outputs{gpd.mutable_pattern(), - "duplicated_outputs"}; - duplicated_outputs(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_outputs); - AddDequantizes(g, op, dequantize_counter); - }; - gpd(graph, handler); + ir::Node* create_quant_op(const std::string& input_name, + const std::string& output_name) const override { + return Quanter::create_quant_op(output_name, input_name); + } +}; } +using string::PrettyLogDetail; + +void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { + int quantize_counter = 0; + int dequantize_counter = 0; -// Adding dequantize ops after all operators except split, which has -// already been handled in AddReoderAfterDuplicatedOutputs -void AddReoderAfterSingleOutputs(ir::Graph* graph, int& dequantize_counter) { GraphPatternDetector gpd; - patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), - "last_bfloat16_ops"}; - bfloat16_ops(); + patterns::Bloat16Ops Bloat16Ops{gpd.mutable_pattern(), "Bloat16Ops"}; + Bloat16Ops(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); - if (op->Op()->Type() != "split") { - AddDequantize(g, op, op_out, dequantize_counter); - } + Graph* graph) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, Bloat16Ops); + + Quantizer quantizer(graph, op); + quantizer.AddQuantOps(); + quantize_counter += quantizer.get_counter(); + + DeQuantizer dequantizer(graph, op); + dequantizer.AddQuantOps(); + dequantize_counter += dequantizer.get_counter(); }; gpd(graph, handler); -} -void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { - int dequantize_counter = 0; - AddReoderAfterDuplicatedOutputs(graph, dequantize_counter); - AddReoderAfterSingleOutputs(graph, dequantize_counter); + PrettyLogDetail("--- added %d quantize ops before bfloat16 op", + quantize_counter); PrettyLogDetail("--- added %d dequantize ops after bfloat16 op", dequantize_counter); } -void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { - SetInputDataType(graph); - SetOutputDataType(graph); -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h index 3a7271f7ddc59..69c7ce35162ff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h +++ 
b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h @@ -24,8 +24,6 @@ namespace ir { class CPUBFloat16Pass : public Pass { protected: - void SetInputDataType(ir::Graph* graph) const; - void SetOutputDataType(ir::Graph* graph) const; void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index d89891ec3c857..fc7a53c4e7923 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -27,8 +27,16 @@ namespace ir { using string::PrettyLogDetail; -void CPUBfloat16PlacementPass::SetMkldnnDataType( - ir::Graph* graph, int* bfloat16_operators) const { +void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { + int bfloat16_operators = 0; + bfloat16_operators += SetMkldnnDataType(graph); + bfloat16_operators -= RemoveOrphanedOperators(graph); + bfloat16_operators -= RemoveUnsupportedOperators(graph); + PrettyLogDetail("--- marked %d operators to bfloat16 ", + bfloat16_operators); +} + +int CPUBfloat16PlacementPass::SetMkldnnDataType(ir::Graph* graph) const { const auto& op_types_list = Get>("bfloat16_enabled_op_types"); // set mkldnn_data_type to bfloat16 to all operators that are in @@ -39,6 +47,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( "bfloat16_placement"}; bfloat16_placement_pattern(op_types_list); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern); @@ -50,58 +59,58 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( if ((op->Op()->HasAttr("mkldnn_data_type") || op->Op()->HasProtoAttr("mkldnn_data_type")) && !platform::HasOpINT8DataType(op->Op())) { + VLOG(4) << "--- marked " << op->Op()->Type() + << " operator to bfloat16 "; op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16")); - (*bfloat16_operators)++; + detected_operators++; } }; gpd(graph, handler); + return detected_operators; } -void CPUBfloat16PlacementPass::RemoveOrphanedOperators( - ir::Graph* graph, int* bfloat16_operators) const { +int CPUBfloat16PlacementPass::RemoveOrphanedOperators(ir::Graph* graph) const { // find orphaned bfloat16 operator that is between two float32 operators // revert mkldnn_data_type attr to float32 GraphPatternDetector gpd; patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(), "orphaned_bfloat16"}; orphaned_bfloat16_pattern(); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern); op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); - bfloat16_operators--; + VLOG(4) << "--- demarked " << op->Op()->Type() << " operator to bfloat16 "; + detected_operators++; }; gpd(graph, handler); + return detected_operators; } -void CPUBfloat16PlacementPass::RemoveUnsupportedOperators( - ir::Graph* graph, int* bfloat16_operators) const { +int CPUBfloat16PlacementPass::RemoveUnsupportedOperators( + ir::Graph* graph) const { // now quantize is supported FP32 only, so try to find // bfloat16 operator that input type is not FP32 GraphPatternDetector gpd; patterns::UnsupportedBfloat16 unsupported_bfloat16_pattern{ gpd.mutable_pattern(), "unsupported_bfloat16"}; unsupported_bfloat16_pattern(); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, 
Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, unsupported_bfloat16_pattern); GET_IR_NODE_FROM_SUBGRAPH(op, op, unsupported_bfloat16_pattern); if ((prev_out->Var()->GetDataType() != proto::VarType::FP32)) { op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); - bfloat16_operators--; + VLOG(4) << "--- demarked " << op->Op()->Type() + << " operator to bfloat16 "; + detected_operators++; } }; gpd(graph, handler); -} - -void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { - int bfloat16_operators = 0; - SetMkldnnDataType(graph, &bfloat16_operators); - RemoveOrphanedOperators(graph, &bfloat16_operators); - RemoveUnsupportedOperators(graph, &bfloat16_operators); - PrettyLogDetail("--- marked %d operators to bfloat16 ", - bfloat16_operators); + return detected_operators; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h index facc4c4c55221..63848298a879a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -26,14 +26,11 @@ namespace ir { */ class CPUBfloat16PlacementPass : public Pass { protected: - void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; - - void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const; - - void RemoveUnsupportedOperators(ir::Graph* graph, - int* bfloat16_operators) const; - void ApplyImpl(ir::Graph* graph) const override; + + int SetMkldnnDataType(ir::Graph* graph) const; + int RemoveOrphanedOperators(ir::Graph* graph) const; + int RemoveUnsupportedOperators(ir::Graph* graph) const; }; } // namespace ir From 687219fee50d7e0e4a37f12d4ee3d8c3cbac7ec0 Mon Sep 17 00:00:00 2001 From: WangXi Date: Thu, 28 Apr 2022 16:05:48 +0800 Subject: [PATCH 09/10] fix FusedResidualDropoutBias nan in v100 (#42344) --- .../operators/fused/fused_dropout_common.h | 14 +++++++++++--- .../fused/fused_residual_dropout_bias_test.cu | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 6bf3a7114f4ce..0fe76fa23a637 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const platform::CUDADeviceContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { const uint32_t tmp_cols = cols / vec_size; - int threads = std::max( - static_cast(32), - std::min(tmp_cols, static_cast(ctx.GetMaxThreadsPerBlock()))); + // NOTE(wangxi): We set max_block_size to 512, for `FusedResidualDropoutBias` + // needs too many register resources. If data_type is float16, CUDA + // error(701) will occur when block_size is 1024. Which error is + // 'cudaErrorLaunchOutOfResources', this indicates that a launch did not + // occur because it did not have appropriate resources. + // Of course, this kernel can be optimized later to reduce the use + // of registers. 
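+  // Worked example (illustrative values): with cols = 12288 and a
+  // vectorization width of 8, tmp_cols = 1536. The previous
+  // std::min(tmp_cols, GetMaxThreadsPerBlock()) picked 1024 threads (the
+  // usual per-block limit) and could hit error 701 for float16, while the
+  // capped expression below picks 512 threads and launches
+  // (1536 + 511) / 512 = 3 blocks along x.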
+ int threads = + std::max(static_cast(32), + std::min(tmp_cols, static_cast(std::min( + ctx.GetMaxThreadsPerBlock(), 512)))); const auto blocks_x = std::max(static_cast(1), (tmp_cols + threads - 1) / threads); const auto blocks_y = std::max(static_cast(1), rows); diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 5dff5e2225f4f..caceac1228e0a 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias { dropout_prob, is_upscale_in_train, is_test); } ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); // add residual for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias { src.data(), residual.data(), bias_ptr, mask.data(), out.data(), *ctx); ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } void FusedBackward() { @@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) { test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-3)); } + +TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) { + // Used to test that `cudaErrorLaunchOutOfResources` will not occur + int rows = 1; + int cols = 12288; + if (std::getenv("_rows") != nullptr) { + rows = atoi(std::getenv("_rows")); + } + if (std::getenv("_cols") != nullptr) { + cols = atoi(std::getenv("_cols")); + } + TestFusedResidualDropoutBias test(rows, cols, 0, 0.0, true, + true); + test.Run(); + test.CheckOut(static_cast(1e-1)); + test.CheckGrad(static_cast(1e-1)); +} From 7cb4953941230dc109a094c6baefaaff7dda515c Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 28 Apr 2022 16:06:34 +0800 Subject: [PATCH 10/10] Suppport more scenes for fused_fast_ln (#42282) * Suppport more scenes for fused_fast_ln * fix --- .../fused_layernorm_residual_dropout_bias.h | 165 +++++++++++++----- 1 file changed, 119 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index d53a24a57e3cc..aa613dd3f5ce0 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -156,9 +156,9 @@ __global__ void FusedLayernormResidualDropoutBias( } /* -* @brief layernorm(residual + dropout(x)); + * @brief layernorm(residual + dropout(x)); * Conditions: - * (1) The number of cols is 1024; + * (1) The number of cols is 768/1024/4096; * (2) layer_norm scale and bias is not null; * (3) linear bias is null; * @param @@ -166,6 +166,7 @@ __global__ void FusedLayernormResidualDropoutBias( * cols: 1024 * x_: [rows, cols], inputs * residual_:[rows, cols] + * bias_: [cols], linear bias, can be null * gamma_: [cols]: layernorm scale, not null * beta_: [cols], layernorm bias, not null * mask_out_: [rows, cols], dropout result @@ -173,7 +174,7 @@ __global__ void FusedLayernormResidualDropoutBias( * y_: [rows, cols], layernorm result * mean_out_: [rows]: layernorm means * var_out_: [rows]: layernorm vars -*/ + */ template < typename T, typename U, typename ScaleT = U, typename MaskType = uint8_t, int VecSize = 8, int WARPS_M = 4, int WARPS_N = 1, int BYTES_PER_LDG = 16, @@ -182,14 +183,16 @@ template < int THREADS_PER_CTA = WARPS_M 
*THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> -__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( int rows, int cols, uint64_t seed, const float dropout_prob, const bool is_upscale_in_train, const bool is_test, const uint64_t increment, const float epsilon, const T *__restrict__ x_ptr, - const T *__restrict__ residual_ptr, const ScaleT *__restrict__ gamma_ptr, - const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, - U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, - T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { + const T *__restrict__ residual_ptr, const T *__restrict__ bias_ptr, + const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, + MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, + U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, + T *__restrict__ y_ptr) { + __shared__ U smem[WARPS_M * WARPS_N]; using Vec = phi::AlignedVector; using Vec_scale = phi::AlignedVector; using MaskStoreT = phi::AlignedVector; @@ -204,12 +207,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const int c = warp_n * THREADS_PER_WARP + lane; // lane const int r = bidx * ROWS_PER_CTA + warp_m; // row id - int idx = r * LN_NUM_COLS + c; + int idx = r * ELTS_PER_ROW + c; curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + // bias + Vec bias[LDGS]; + if (bias_ptr != nullptr) { +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + phi::Load(bias_ptr + col * VecSize, &bias[it]); + col += THREADS_PER_ROW; + } + } + Vec_scale gamma[LDGS]; Vec_scale beta[LDGS]; #pragma unroll @@ -219,14 +232,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( col += THREADS_PER_ROW; } - constexpr U rn = 1.f / U(LN_NUM_COLS); + constexpr U rn = 1.f / U(ELTS_PER_ROW); for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { Vec x[LDGS]; Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); - phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * ELTS_PER_ROW + col * VecSize, &residual[it]); col += THREADS_PER_ROW; } @@ -255,14 +268,28 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // 4 * 8 U xf[LDGS * VecSize]; + if (bias_ptr != nullptr) { #pragma unroll - for (int it = 0; it < LDGS; it++) { + for (int it = 0; it < LDGS; it++) { #pragma unroll - for (int jt = 0; jt < VecSize; jt++) { - // dropout(x) + residual - x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + - residual[it][jt]; - xf[it * VecSize + jt] = U(x[it][jt]); + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = (x[it][jt] + bias[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } + } + } else { +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } } } @@ -270,9 +297,9 
@@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { phi::Store( - x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); + x[it], residual_out_ptr + row * ELTS_PER_ROW + col * VecSize); phi::Store( - mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); + mask_vec[it], mask_out_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } @@ -289,6 +316,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = mu_local; + } + __syncthreads(); + if (tidx == 0) { + mu_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + mu_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = mu_local; + } + __syncthreads(); + mu_local = smem[warp_m]; + } mu_local *= rn; if (lane == 0) { mean_out_ptr[row] = mu_local; @@ -308,6 +351,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = var_local; + } + __syncthreads(); + if (tidx == 0) { + var_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + var_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = var_local; + } + __syncthreads(); + var_local = smem[warp_m]; + } U rsigma = rsqrtf(var_local * rn + epsilon); if (lane == 0) { // Note: the stored var is different for paddle(ln) and apex (fast ln). @@ -332,7 +391,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } } @@ -390,12 +449,37 @@ void LaunchLayernormResidualDropoutBias( return; } - bool can_call_1024_kernel = false; - if (cols == 1024 && scale != nullptr && layernorm_bias != nullptr && - bias == nullptr) { - can_call_1024_kernel = true; +#define LAUNCH_FUSED_FAST_LN_KERNEL_BASE(cols) \ + case (cols): { \ + constexpr int WARPS_N = cols < 1024 ? 
1 : (cols / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int grid = \ + static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ + fused_fast_ln_fwd_kernel< \ + T, U, LayerNormScaleBiasT, uint8_t, \ + VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, \ + cols><<>>( \ + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ + increment, epsilon, src, residual, bias, scale, layernorm_bias, \ + mask_data, mean, var, dst, layernorm_dst); \ + } break + +#define LAUNCH_FUSED_FAST_LN_KERNEL \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(768); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1024); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096) + + bool can_call_fast_ln_kernel = false; + if ((cols == 768 || cols == 1024 || cols == 4096) && scale != nullptr && + layernorm_bias != nullptr) { + can_call_fast_ln_kernel = true; } - VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; + VLOG(6) << "can_call_fast_ln_kernel = " << can_call_fast_ln_kernel; const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { @@ -407,26 +491,15 @@ void LaunchLayernormResidualDropoutBias( epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - // Note: the grid can not exceed max_grid of the gpu. - const int grid = - static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); - fused_ln_fwd_1024_kernel< - T, U, LayerNormScaleBiasT, uint8_t, - VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, - increment, epsilon, src, residual, scale, layernorm_bias, mask_data, - mean, var, dst, layernorm_dst); + if (can_call_fast_ln_kernel) { + switch (cols) { + LAUNCH_FUSED_FAST_LN_KERNEL; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only when column is equal to 768/1024/4096 is supported for " + "now")); + break; + } } else { int blockDim = GetDesiredBlockDim(cols / VecSize); FusedLayernormResidualDropoutBias<