From eb28a36dea3998afababbd76803c7e2f8de7ea0b Mon Sep 17 00:00:00 2001
From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com>
Date: Fri, 24 Sep 2021 14:41:59 +0800
Subject: [PATCH 1/7] concat api support empty tensor. (#35845)

---
 python/paddle/fluid/layers/tensor.py | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 3a0bbeb0fa341..abc84be12b38f 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -322,6 +322,8 @@ def concat(input, axis=0, name=None):
         if isinstance(axis, Variable):
             axis = axis.numpy()
             axis = axis.item(0)
+        if not isinstance(input, Variable):
+            input = [t for t in input if t.shape.count(0) == 0]
         return _C_ops.concat(input, 'axis', axis)
 
     check_type(input, 'input', (list, tuple, Variable), 'concat')

From 4f42e5d77c8b1083f73389cc555ff59c288c0535 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng <dengkaipeng@baidu.com>
Date: Fri, 24 Sep 2021 15:36:10 +0800
Subject: [PATCH 2/7] fix undefined var in test_batch_sampler. test=develop
 (#35924)

---
 python/paddle/fluid/tests/unittests/test_batch_sampler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
index 4c323a2511f5b..4c5338314afb1 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py
@@ -22,6 +22,8 @@
                         RandomSampler, WeightedRandomSampler
 from paddle.io import DistributedBatchSampler
 
+IMAGE_SIZE = 32
+
 
 class RandomDataset(Dataset):
     def __init__(self, sample_num, class_num):
@@ -31,7 +33,7 @@ def __init__(self, sample_num, class_num):
     def __getitem__(self, idx):
         np.random.seed(idx)
         image = np.random.random([IMAGE_SIZE]).astype('float32')
-        label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64')
+        label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64')
         return image, label
 
     def __len__(self):

From 485b387d0db2b7f4592a2b13f9c68acba5fe7a60 Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Fri, 24 Sep 2021 09:47:36 +0200
Subject: [PATCH 3/7] [oneDNN] candidate fix to #34554 (#35884)

* - candidate fix

* - More fixes to #34554

* - another inconsistent fix to key

* - Removed unneeded line

* - matching the cache behaviour to other ops
---
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  | 21 +++----
 paddle/fluid/platform/mkldnn_reuse.h          | 56 ++++++++-----------
 2 files changed, 32 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 09386fc31ee31..1b69dd7ea00c7 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -706,7 +706,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         platform::CreateKey(dev_ctx, src_tz, src_dt,
                             ctx.InputName("Input") + ctx.InputName("Filter"));
 
-    const std::string key_conv_pd = key + "@conv_pd";
     bool need_s8_to_u8 = false;
     std::shared_ptr<mkldnn::convolution_forward> conv_p;
     std::shared_ptr<mkldnn::memory> src_memory_p;
@@ -721,6 +720,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     // are merged/unified, this will disappear
     auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
 
+    const std::string key_conv_pd = key_tid + "@conv_pd";
     auto prim_key = key_tid + "@conv_p";
     auto dst_key = key_tid + "@dst_mem_p";
     auto src_key = key_tid + "@src_mem_p";
@@ -731,12 +731,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto src_reorder_key = key_tid + "@src_mem_preorder_p";
     auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p";
 
-    conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
-        dev_ctx.GetBlob(prim_key));
+    conv_pd =
+        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_conv_pd));
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
-    if (conv_p == nullptr || !is_test) {
+    if (conv_pd == nullptr || !is_test) {
       float fuse_alpha = ctx.Attr<float>("fuse_alpha");
       float fuse_beta = ctx.Attr<float>("fuse_beta");
       bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
@@ -946,7 +947,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       }
 
       // create convolution op primitive
-      auto scale_bias_key = key + "@scale_bias";
       conv_p = handler->AcquireConvolution();
       if (bias) {
         const K* bias_data = bias->data<K>();
@@ -1000,13 +1000,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           dev_ctx.GetBlob(weights_key));
       dst_memory_p =
           std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
-      conv_pd =
-          std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
-              dev_ctx.GetBlob(key_conv_pd));
-      if (conv_pd) {
-        handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
-                                                      mkldnn_engine, key));
-      }
+      conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
+          dev_ctx.GetBlob(prim_key));
+      handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
+                                                    mkldnn_engine, key));
 
       if (fuse_residual_conn) {
         auto residual_param = ctx.Input<Tensor>("ResidualData");
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 29a3f8e9dcd3c..d6ab9e50a066e 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -603,7 +603,6 @@ class MKLDNNHandler {
                 const std::string& base_key)
       : dev_ctx_(dev_ctx),
         engine_(engine),
-        key_common_(base_key),
         key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) {
     platform::MKLDNNDeviceContext::tls().log_lib_version();
   }
@@ -789,7 +788,6 @@ class MKLDNNHandler {
  protected:
   const MKLDNNDeviceContext& dev_ctx_;
   mkldnn::engine engine_;
-  std::string key_common_;
   std::string key_;
 };
 
@@ -1371,42 +1369,34 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
     // Conv PD has to be passed to Grad op that
     // may be exxecuted by diffrent thread, hence
     // for that one we use key that does not contain TID
-    const std::string key_conv_pd = key_common_ + "@conv_pd";
+    const std::string key_conv_pd = key_ + "@conv_pd";
 
     conv_pd_ = std::static_pointer_cast<typename forward_t::primitive_desc>(
         dev_ctx_.GetBlob(key_conv_pd));
 
     if (conv_pd_ == nullptr) {
-      static std::mutex acquire_barrier;
-      std::lock_guard<std::mutex> block_threads_until_finish_this_job(
-          acquire_barrier);
-
-      conv_pd_ = std::static_pointer_cast<typename forward_t::primitive_desc>(
-          dev_ctx_.GetBlob(key_conv_pd));
-      if (conv_pd_ == nullptr) {
-        mkldnn::memory::dims stride_dims = strides;
-        mkldnn::memory::dims dilations_dims = dilations;
-        auto mkldnn_paddings = ToMkldnnPadding(paddings);
-
-        auto conv_desc =
-            bias ? typename forward_t::desc(
-                       fwd_prop_kind, convolutional_algorithm<forward_t>::T,
-                       src, weights, *bias, dst, stride_dims, dilations_dims,
-                       mkldnn_paddings[0], mkldnn_paddings[1])
-                 : typename forward_t::desc(
-                       fwd_prop_kind, convolutional_algorithm<forward_t>::T,
-                       src, weights, dst, stride_dims, dilations_dims,
-                       mkldnn_paddings[0], mkldnn_paddings[1]);
-
-        mkldnn::primitive_attr conv_attr =
-            CreatePostOps(fuse_activation, fuse_alpha, fuse_beta,
-                          fuse_residual_conn, output_shift_scale, sum_scale);
-
-        conv_pd_.reset(new typename forward_t::primitive_desc(
-            conv_desc, conv_attr, engine));
-        // Save conv_pd/src_memory/weights_memory for backward pass
-        dev_ctx_.SetBlob(key_conv_pd, conv_pd_);
-      }
+      mkldnn::memory::dims stride_dims = strides;
+      mkldnn::memory::dims dilations_dims = dilations;
+      auto mkldnn_paddings = ToMkldnnPadding(paddings);
+
+      auto conv_desc =
+          bias ? typename forward_t::desc(
+                     fwd_prop_kind, convolutional_algorithm<forward_t>::T, src,
+                     weights, *bias, dst, stride_dims, dilations_dims,
+                     mkldnn_paddings[0], mkldnn_paddings[1])
+               : typename forward_t::desc(
+                     fwd_prop_kind, convolutional_algorithm<forward_t>::T, src,
+                     weights, dst, stride_dims, dilations_dims,
+                     mkldnn_paddings[0], mkldnn_paddings[1]);
+
+      mkldnn::primitive_attr conv_attr =
+          CreatePostOps(fuse_activation, fuse_alpha, fuse_beta,
+                        fuse_residual_conn, output_shift_scale, sum_scale);
+
+      conv_pd_.reset(
+          new typename forward_t::primitive_desc(conv_desc, conv_attr, engine));
+      // Save conv_pd/src_memory/weights_memory for backward pass
+      dev_ctx_.SetBlob(key_conv_pd, conv_pd_);
     }
 
     return conv_pd_;

From 82f255d0a5ab93b8996897cb8d6a28484694c91a Mon Sep 17 00:00:00 2001
From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com>
Date: Fri, 24 Sep 2021 03:14:07 -0500
Subject: [PATCH 4/7] add pool2d convert test (#35923)

* add pool2d convert test

* modify error

* modify error

* modify error

* modify error

* modify error

* modify error
---
 .../inference/tensorrt/convert/pool2d_op.cc   |  13 +
 paddle/fluid/inference/tensorrt/op_teller.cc  |  20 ++
 .../test_trt_convert_anchor_generator.py      | 116 +++++++++
 .../test_trt_convert_conv2d_transpose.py      | 227 ++++++++++++++++++
 .../test_trt_convert_depthwise_conv2d.py      | 203 ++++++++++++++++
 ..._trt_convert_depthwise_conv2d_transpose.py | 191 +++++++++++++++
 .../ir/inference/test_trt_convert_pool2d.py   | 148 ++++++++++++
 7 files changed, 918 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py

diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 90d6392fd6404..1898f28c73ad0 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -87,6 +87,10 @@ class Pool2dOpConverter : public OpConverter {
     bool adaptive = false;
     if (op_desc.HasAttr("adaptive"))
       adaptive = BOOST_GET_CONST(bool, op_desc.GetAttr("adaptive"));
+    std::string padding_algorithm = "EXPLICIT";
+    if (op_desc.HasAttr("padding_algorithm"))
+      padding_algorithm =
+          BOOST_GET_CONST(std::string, op_desc.GetAttr("padding_algorithm"));
 
     nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
     nvinfer1::ReduceOperation reduce_operation =
@@ -124,6 +128,9 @@ class Pool2dOpConverter : public OpConverter {
         pool_layer->setStride(nv_strides);
         pool_layer->setPadding(nv_paddings);
         pool_layer->setAverageCountExcludesPadding(exclusive);
+        if (padding_algorithm == "SAME") {
+          pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
+        }
         layer = pool_layer;
       } else if (global_pooling) {
         auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1,
@@ -159,6 +166,9 @@ class Pool2dOpConverter : public OpConverter {
       auto output_name = op_desc.Output("Out")[0];
       pool_layer->setStride(nv_strides);
       pool_layer->setPadding(nv_paddings);
+      if (padding_algorithm == "SAME") {
+        pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
+      }
       pool_layer->setAverageCountExcludesPadding(exclusive);
       pool_layer->setName(("pool2d (Output: " + output_name + ")").c_str());
       pool_layer->getOutput(0)->setName(output_name.c_str());
@@ -198,6 +208,9 @@ class Pool2dOpConverter : public OpConverter {
                           "trt pool layer in converter could not be created."));
       pool_layer->setStride(nv_strides);
       pool_layer->setPadding(nv_paddings);
+      if (padding_algorithm == "SAME") {
+        pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
+      }
       pool_layer->setAverageCountExcludesPadding(exclusive);
       layer = pool_layer;
     } else {
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index ea630a9c6db90..5bfd2f1277795 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -172,6 +172,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       std::vector<int> paddings =
           BOOST_GET_CONST(std::vector<int>, desc.GetAttr("paddings"));
       if (paddings.size() > 2) return false;
+      if (desc.HasAttr("exclusive")) {
+        if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) {
+          std::vector<int> ksize =
+              BOOST_GET_CONST(std::vector<int>, desc.GetAttr("ksize"));
+          for (size_t i = 0; i < ksize.size(); i++) {
+            if (ksize[i] <= paddings[i]) {
+              VLOG(3) << "the padding size should be less than the filter size "
+                         "for exclusive-counting pooling.";
+              return false;
+            }
+          }
+        }
+      }
+      if (desc.HasAttr("ceil_mode")) {
+        if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false;
+      }
       if (desc.Input("X").size() != 1) {
         VLOG(3) << "TRT Pool2d expect 1 input, but got "
                 << desc.Input("X").size();
@@ -440,6 +456,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       }
     }
 
+    if (op_type == "anchor_generator") {
+      if (!with_dynamic_shape) return false;
+    }
+
     if (op_type == "yolo_box") {
       if (with_dynamic_shape) return false;
       bool has_attrs =
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
new file mode 100644
index 0000000000000..bf457a9da40a8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+
+class TrtConvertAnchorGeneratorTest(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+        def generate_input1(batch, attrs: List[Dict[str, Any]]):
+            return np.random.random([batch, 3, 64, 64]).astype(np.float32)
+
+        for batch in [1, 2, 4]:
+            for anchor_sizes in [[64.0, 128.0, 256.0, 512.0]]:
+                for aspect_ratios in [[0.5, 1, 2], [0.4, 1.2, 3]]:
+                    for variances in [[1.0, 1.0, 1.0, 1.0],
+                                      [0.5, 1.0, 0.5, 1.0]]:
+                        for stride in [[16.0, 16.0], [16.0, 32.0]]:
+                            for offset in [0.5, 0.8]:
+                                dics = [{
+                                    "anchor_sizes": anchor_sizes,
+                                    "aspect_ratios": aspect_ratios,
+                                    "variances": variances,
+                                    "stride": stride,
+                                    "offset": offset
+                                }]
+
+                                ops_config = [{
+                                    "op_type": "anchor_generator",
+                                    "op_inputs": {
+                                        "Input": ["input_data"]
+                                    },
+                                    "op_outputs": {
+                                        "Anchors": ["output_anchors"],
+                                        "Variances": ["output_variances"]
+                                    },
+                                    "op_attrs": dics[0]
+                                }]
+                                ops = self.generate_op_config(ops_config)
+
+                                program_config = ProgramConfig(
+                                    ops=ops,
+                                    weights={},
+                                    inputs={
+                                        "input_data": TensorConfig(
+                                            data_gen=partial(generate_input1,
+                                                             batch, dics))
+                                    },
+                                    outputs=[
+                                        "output_anchors", "output_variances"
+                                    ])
+
+                                yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
+            self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+            self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            return 1, 3
+
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
+                                                                     True), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
+                                                                     True), 1e-5
+
+    def test(self):
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
new file mode 100644
index 0000000000000..82dd492b5275f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+
+class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        inputs = program_config.inputs
+        weights = program_config.weights
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[
+                1] * attrs[0]['groups']:
+            return False
+
+        if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]:
+            return False
+
+        return True
+
+    def sample_program_configs(self):
+        self.trt_param.workspace_size = 1073741824
+
+        def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]):
+            return np.ones([batch, num_channels, 64, 64]).astype(np.float32)
+
+        def generate_weight1(num_channels, attrs: List[Dict[str, Any]]):
+            if attrs[0]['groups'] == 1:
+                return np.random.random(
+                    [num_channels, num_channels, 3, 3]).astype(np.float32)
+            else:
+                return np.random.random(
+                    [num_channels, int(num_channels / 2), 3, 3]).astype(
+                        np.float32)
+
+        for num_channels in [2, 4, 6]:
+            for batch in [1, 2, 4]:
+                for strides in [[1, 1], [2, 2], [1, 2]]:
+                    for paddings in [[0, 3], [1, 2, 3, 4]]:
+                        for groups in [2]:
+                            for padding_algorithm in [
+                                    'EXPLICIT', 'SAME', 'VALID'
+                            ]:
+                                for dilations in [[1, 1], [2, 2], [1, 2]]:
+                                    for data_format in ['NCHW']:
+
+                                        self.num_channels = num_channels
+                                        dics = [{
+                                            "data_fromat": data_format,
+                                            "dilations": dilations,
+                                            "padding_algorithm":
+                                            padding_algorithm,
+                                            "groups": groups,
+                                            "paddings": paddings,
+                                            "strides": strides,
+                                            "data_format": data_format,
+                                            "output_size": [],
+                                            "output_padding": []
+                                        }]
+
+                                        ops_config = [{
+                                            "op_type": "conv2d_transpose",
+                                            "op_inputs": {
+                                                "Input": ["input_data"],
+                                                "Filter": ["conv2d_weight"]
+                                            },
+                                            "op_outputs": {
+                                                "Output": ["output_data"]
+                                            },
+                                            "op_attrs": dics[0]
+                                        }]
+                                        ops = self.generate_op_config(
+                                            ops_config)
+
+                                        program_config = ProgramConfig(
+                                            ops=ops,
+                                            weights={
+                                                "conv2d_weight":
+                                                TensorConfig(data_gen=partial(
+                                                    generate_weight1,
+                                                    num_channels, dics))
+                                            },
+                                            inputs={
+                                                "input_data":
+                                                TensorConfig(data_gen=partial(
+                                                    generate_input1, batch,
+                                                    num_channels, dics))
+                                            },
+                                            outputs=["output_data"])
+
+                                        yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            if self.num_channels == 2:
+                self.dynamic_shape.min_input_shape = {
+                    "input_data": [1, 2, 32, 32],
+                    "output_data": [1, 24, 32, 32]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [4, 2, 64, 64],
+                    "output_data": [4, 24, 64, 64]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input_data": [1, 2, 64, 64],
+                    "output_data": [1, 24, 64, 64]
+                }
+            elif self.num_channels == 4:
+                self.dynamic_shape.min_input_shape = {
+                    "input_data": [1, 4, 32, 32],
+                    "output_data": [1, 24, 32, 32]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [4, 4, 64, 64],
+                    "output_data": [4, 24, 64, 64]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input_data": [1, 4, 64, 64],
+                    "output_data": [1, 24, 64, 64]
+                }
+            else:
+                self.dynamic_shape.min_input_shape = {
+                    "input_data": [1, 6, 32, 32],
+                    "output_data": [1, 24, 32, 32]
+                }
+                self.dynamic_shape.max_input_shape = {
+                    "input_data": [4, 6, 64, 64],
+                    "output_data": [4, 24, 64, 64]
+                }
+                self.dynamic_shape.opt_input_shape = {
+                    "input_data": [1, 6, 64, 64],
+                    "output_data": [1, 24, 64, 64]
+                }
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            return 1, 2
+
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), (1e-5, 1e-5)
+        self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), (1e-5, 1e-5)
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
+                                                                     True), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), (1e-5, 1e-5)
+        self.trt_param.precision = paddle_infer.PrecisionType.Int8
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), (1e-5, 1e-5)
+
+    def add_skip_trt_case(self):
+        def teller1(program_config, predictor_config):
+            if program_config.ops[0].attrs[
+                    'padding_algorithm'] == "SAME" or program_config.ops[
+                        0].attrs['padding_algorithm'] == "VALID":
+                return True
+            return False
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op."
+        )
+
+        def teller2(program_config, predictor_config):
+            if program_config.ops[0].attrs['dilations'][
+                    0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1:
+                return True
+            return False
+
+        self.add_skip_case(
+            teller2, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle."
+        )
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+    def test_quant(self):
+        self.add_skip_trt_case()
+        self.run_test(quant=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
new file mode 100644
index 0000000000000..e6b3aa30bf896
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py
@@ -0,0 +1,178 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+
+
+class TrtConvertDepthwiseConv2dTest(TrtLayerAutoScanTest):
+    """Auto-scan test that feeds depthwise_conv2d programs to TensorRT."""
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        """Check that input dim 1 equals filter dim 1 times `groups`."""
+        groups = program_config.ops[0].attrs['groups']
+        input_shape = program_config.inputs['input_data'].shape
+        filter_shape = program_config.weights['conv2d_weight'].shape
+        return bool(input_shape[1] == filter_shape[1] * groups)
+
+    def sample_program_configs(self):
+        """Yield a ProgramConfig for every depthwise_conv2d attribute set.
+
+        Combinations are visited in nested-loop order over batch, strides,
+        paddings, groups, padding_algorithm, dilations and data_format
+        (outermost first).
+        """
+        self.trt_param.workspace_size = 1073741824
+
+        def generate_input1(batch, attrs: List[Dict[str, Any]]):
+            # groups 1 and 2 give that many input channels; anything else, 3.
+            channels = {1: 1, 2: 2}.get(attrs[0]['groups'], 3)
+            return np.ones([batch, channels, 64, 64]).astype(np.float32)
+
+        def generate_weight1(attrs: List[Dict[str, Any]]):
+            # The filter shape is fixed; `attrs` is accepted but not read.
+            return np.random.random([24, 1, 3, 3]).astype(np.float32)
+
+        combinations = (
+            (batch, strides, paddings, groups, padding_algorithm, dilations,
+             data_format)
+            for batch in [1, 2, 4]
+            for strides in [[1, 1], [2, 2], [1, 2]]
+            for paddings in [[0, 3], [1, 2, 3, 4]]
+            for groups in [1, 2, 3]
+            for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']
+            for dilations in [[1, 1], [2, 2], [1, 2]]
+            for data_format in ['NCHW'])
+
+        for (batch, strides, paddings, groups, padding_algorithm, dilations,
+             data_format) in combinations:
+            # NOTE(review): "data_fromat" looks like a misspelled duplicate of
+            # "data_format"; it is kept so the op attributes stay unchanged.
+            dics = [{
+                "data_fromat": data_format,
+                "dilations": dilations,
+                "padding_algorithm": padding_algorithm,
+                "groups": groups,
+                "paddings": paddings,
+                "strides": strides,
+                "data_format": data_format
+            }]
+
+            ops_config = [{
+                "op_type": "depthwise_conv2d",
+                "op_inputs": {
+                    "Input": ["input_data"],
+                    "Filter": ["conv2d_weight"]
+                },
+                "op_outputs": {
+                    "Output": ["output_data"]
+                },
+                "op_attrs": dics[0]
+            }]
+            ops = self.generate_op_config(ops_config)
+
+            weight_gen = partial(generate_weight1, dics)
+            input_gen = partial(generate_input1, batch, dics)
+            yield ProgramConfig(
+                ops=ops,
+                weights={"conv2d_weight": TensorConfig(data_gen=weight_gen)},
+                inputs={"input_data": TensorConfig(data_gen=input_gen)},
+                outputs=["output_data"])
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        """Yield (config, expected TRT node counts, tolerance) tuples.
+
+        Float32, Half and Int8 precisions are visited in that order, first
+        with the dynamic-shape settings cleared and then with them filled in.
+        """
+
+        def generate_dynamic_shape(attrs):
+            # Channel count follows `groups` for 1 and 2 and is 3 otherwise.
+            channels = {1: 1, 2: 2}.get(attrs[0]['groups'], 3)
+            self.dynamic_shape.min_input_shape = {
+                "input_data": [1, channels, 32, 32],
+                "output_data": [1, 24, 32, 32]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input_data": [4, channels, 64, 64],
+                "output_data": [4, 24, 64, 64]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input_data": [1, channels, 64, 64],
+                "output_data": [1, 24, 64, 64]
+            }
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            # The expectation does not depend on either argument.
+            return 1, 2
+
+        attrs = [op.attrs for op in program_config.ops]
+
+        # Tolerance paired with each precision, in the order they are tried.
+        precision_cases = [
+            (paddle_infer.PrecisionType.Float32, 1e-5),
+            (paddle_infer.PrecisionType.Half, (1e-5, 1e-5)),
+            (paddle_infer.PrecisionType.Int8, (1e-5, 1e-5)),
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        for precision, tolerance in precision_cases:
+            self.trt_param.precision = precision
+            config = self.create_inference_config()
+            yield config, generate_trt_nodes_num(attrs, False), tolerance
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        for precision, tolerance in precision_cases:
+            self.trt_param.precision = precision
+            config = self.create_inference_config()
+            yield config, generate_trt_nodes_num(attrs, True), tolerance
+
+    def add_skip_trt_case(self):
+        """Register the op configurations that this test skips."""
+
+        def teller1(program_config, predictor_config):
+            # True for the padding modes named in the skip reason below.
+            algorithm = program_config.ops[0].attrs['padding_algorithm']
+            return algorithm in ("SAME", "VALID")
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op."
+        )
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+    def test_quant(self):
+        self.add_skip_trt_case()
+        self.run_test(quant=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
new file mode 100644
index 0000000000000..473925c6cdb79
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+
+
+class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest):
+    """Auto-scan test that feeds conv2d_transpose programs to TensorRT."""
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        """Accept only programs whose channel counts pass all three checks.
+
+        NOTE(review): the filters generated in sample_program_configs always
+        have dim 1 equal to 1, so the second check presumably admits only
+        groups == 1 -- confirm that this restriction is intended.
+        """
+        groups = program_config.ops[0].attrs['groups']
+        in_channels = program_config.inputs['input_data'].shape[1]
+        filter_dim1 = program_config.weights['conv2d_weight'].shape[1]
+
+        if in_channels != filter_dim1 * groups:
+            return False
+        if in_channels != filter_dim1:
+            return False
+        if in_channels != groups:
+            return False
+        return True
+
+    def sample_program_configs(self):
+        """Yield a ProgramConfig for every conv2d_transpose attribute set.
+
+        Combinations are visited in nested-loop order over batch, strides,
+        paddings, groups, padding_algorithm, dilations and data_format
+        (outermost first).
+        """
+        self.trt_param.workspace_size = 1073741824
+
+        def generate_input1(batch, attrs: List[Dict[str, Any]]):
+            channels = attrs[0]['groups']
+            return np.ones([batch, channels, 64, 64]).astype(np.float32)
+
+        def generate_weight1(attrs: List[Dict[str, Any]]):
+            filter_shape = [attrs[0]['groups'], 1, 3, 3]
+            return np.random.random(filter_shape).astype(np.float32)
+
+        combinations = (
+            (batch, strides, paddings, groups, padding_algorithm, dilations,
+             data_format)
+            for batch in [1, 2, 4]
+            for strides in [[1, 1], [2, 2], [1, 2]]
+            for paddings in [[0, 3], [1, 2, 3, 4]]
+            for groups in [1, 2, 3]
+            for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']
+            for dilations in [[1, 1], [2, 2], [1, 2]]
+            for data_format in ['NCHW'])
+
+        for (batch, strides, paddings, groups, padding_algorithm, dilations,
+             data_format) in combinations:
+            # NOTE(review): "data_fromat" looks like a misspelled duplicate of
+            # "data_format"; it is kept so the op attributes stay unchanged.
+            dics = [{
+                "data_fromat": data_format,
+                "dilations": dilations,
+                "padding_algorithm": padding_algorithm,
+                "groups": groups,
+                "paddings": paddings,
+                "strides": strides,
+                "data_format": data_format,
+                "output_size": [],
+                "output_padding": []
+            }]
+
+            ops_config = [{
+                "op_type": "conv2d_transpose",
+                "op_inputs": {
+                    "Input": ["input_data"],
+                    "Filter": ["conv2d_weight"]
+                },
+                "op_outputs": {
+                    "Output": ["output_data"]
+                },
+                "op_attrs": dics[0]
+            }]
+            ops = self.generate_op_config(ops_config)
+
+            weight_gen = partial(generate_weight1, dics)
+            input_gen = partial(generate_input1, batch, dics)
+            yield ProgramConfig(
+                ops=ops,
+                weights={"conv2d_weight": TensorConfig(data_gen=weight_gen)},
+                inputs={"input_data": TensorConfig(data_gen=input_gen)},
+                outputs=["output_data"])
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        """Yield (config, expected TRT node counts, tolerance) tuples.
+
+        Float32, Half and Int8 precisions are visited in that order, first
+        with the dynamic-shape settings cleared and then with them filled in.
+        """
+
+        def generate_dynamic_shape(attrs):
+            channels = attrs[0]['groups']
+            self.dynamic_shape.min_input_shape = {
+                "input_data": [1, channels, 32, 32],
+                "output_data": [1, channels, 32, 32]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input_data": [4, channels, 64, 64],
+                "output_data": [4, channels, 64, 64]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input_data": [1, channels, 64, 64],
+                "output_data": [1, channels, 64, 64]
+            }
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            # The expectation does not depend on either argument.
+            return 1, 2
+
+        attrs = [op.attrs for op in program_config.ops]
+
+        # Tolerance paired with each precision, in the order they are tried.
+        precision_cases = [
+            (paddle_infer.PrecisionType.Float32, 1e-5),
+            (paddle_infer.PrecisionType.Half, (1e-5, 1e-5)),
+            (paddle_infer.PrecisionType.Int8, (1e-5, 1e-5)),
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        for precision, tolerance in precision_cases:
+            self.trt_param.precision = precision
+            config = self.create_inference_config()
+            yield config, generate_trt_nodes_num(attrs, False), tolerance
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        for precision, tolerance in precision_cases:
+            self.trt_param.precision = precision
+            config = self.create_inference_config()
+            yield config, generate_trt_nodes_num(attrs, True), tolerance
+
+    def add_skip_trt_case(self):
+        """Register the op configurations that this test skips."""
+
+        def teller1(program_config, predictor_config):
+            # True for the padding modes named in the skip reason below.
+            algorithm = program_config.ops[0].attrs['padding_algorithm']
+            return algorithm in ("SAME", "VALID")
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op."
+        )
+
+        def teller2(program_config, predictor_config):
+            # Only the first two dilation entries are inspected.
+            dilations = program_config.ops[0].attrs['dilations']
+            return bool(dilations[0] != 1 or dilations[1] != 1)
+
+        self.add_skip_case(
+            teller2, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle."
+        )
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+    def test_quant(self):
+        self.add_skip_trt_case()
+        self.run_test(quant=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
new file mode 100644
index 0000000000000..3e923b1bd89d6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+
+
+class TrtConvertPool2dTest(TrtLayerAutoScanTest):
+    """Auto-scan test that feeds pool2d programs to TensorRT."""
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        """Every sampled program is accepted."""
+        return True
+
+    def sample_program_configs(self):
+        """Yield a ProgramConfig for every pool2d attribute combination.
+
+        Combinations are visited in nested-loop order over strides, paddings,
+        pooling_type, padding_algorithm, ksize, data_format, global_pooling,
+        exclusive, adaptive and ceil_mode (outermost first).
+        """
+        self.trt_param.workspace_size = 1073741824
+
+        def generate_input1(attrs: List[Dict[str, Any]]):
+            # The input shape is fixed; `attrs` is accepted but not read.
+            return np.ones([1, 3, 64, 64]).astype(np.float32)
+
+        combinations = (
+            (strides, paddings, pooling_type, padding_algorithm, ksize,
+             data_format, global_pooling, exclusive, adaptive, ceil_mode)
+            for strides in [[1, 1], [2, 2], [1, 2]]
+            for paddings in [[0, 2], [0, 3], [1, 2, 3, 4]]
+            for pooling_type in ['max', 'avg']
+            # NOTE(review): 'VAILD' looks like a typo for 'VALID'; it is kept
+            # because correcting it would change which programs are tested.
+            for padding_algorithm in ['EXPLICIT', 'SAME', 'VAILD']
+            for ksize in [[2, 3], [3, 3]]
+            for data_format in ['NCHW']
+            for global_pooling in [True, False]
+            for exclusive in [True, False]
+            for adaptive in [True, False]
+            for ceil_mode in [True, False])
+
+        for (strides, paddings, pooling_type, padding_algorithm, ksize,
+             data_format, global_pooling, exclusive, adaptive,
+             ceil_mode) in combinations:
+            # NOTE(review): "data_fromat" looks like a misspelled duplicate of
+            # "data_format"; it is kept so the op attributes stay unchanged.
+            dics = [{
+                "pooling_type": pooling_type,
+                "ksize": ksize,
+                "data_fromat": data_format,
+                "padding_algorithm": padding_algorithm,
+                "paddings": paddings,
+                "strides": strides,
+                "data_format": data_format,
+                "global_pooling": global_pooling,
+                "exclusive": exclusive,
+                "adaptive": adaptive,
+                "ceil_mode": ceil_mode
+            }]
+
+            ops_config = [{
+                "op_type": "pool2d",
+                "op_inputs": {
+                    "X": ["input_data"],
+                },
+                "op_outputs": {
+                    "Out": ["output_data"]
+                },
+                "op_attrs": dics[0]
+            }]
+            ops = self.generate_op_config(ops_config)
+
+            input_gen = partial(generate_input1, dics)
+            yield ProgramConfig(
+                ops=ops,
+                weights={},
+                inputs={"input_data": TensorConfig(data_gen=input_gen)},
+                outputs=["output_data"])
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        """Yield (config, expected TRT node counts, tolerance) tuples.
+
+        Float32 and Half precisions are visited in that order, first with the
+        dynamic-shape settings cleared and then with them filled in.
+        """
+
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]}
+            self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+            self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            # Read paddings from the op attributes of the program under test
+            # instead of relying on state left behind by the sampling loop.
+            op_attrs = attrs[0]
+            if (op_attrs['paddings'] == [0, 3] or op_attrs['global_pooling'] or
+                    op_attrs['ceil_mode']):
+                return 0, 3
+            return 1, 2
+
+        attrs = [op.attrs for op in program_config.ops]
+        precisions = [
+            paddle_infer.PrecisionType.Float32,
+            paddle_infer.PrecisionType.Half,
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        for precision in precisions:
+            self.trt_param.precision = precision
+            config = self.create_inference_config()
+            yield config, generate_trt_nodes_num(attrs, False), 1e-5
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        for precision in precisions:
+            self.trt_param.precision = precision
+            config = self.create_inference_config()
+            yield config, generate_trt_nodes_num(attrs, True), 1e-5
+
+    def add_skip_trt_case(self):
+        """Register the op configurations that this test skips."""
+
+        def teller1(program_config, predictor_config):
+            return len(program_config.ops[0].attrs['paddings']) == 4
+
+        self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+                           "4-dims paddings are not support for trt now.")
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()

From 1691dc7a9c0a3e861a16e58c5508e3e7233be27d Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Fri, 24 Sep 2021 16:43:53 +0800
Subject: [PATCH 5/7] add update (#36017)

---
 .../paddle/distributed/fleet/meta_parallel/pipeline_parallel.py  | 1 +
 python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index 8fad0686dd42e..431bc6d7bc389 100755
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -329,6 +329,7 @@ def _broadcast_final_loss(self):
     def _optimizer_step(self):
         if self.scaler:
             self.scaler.step(self.optimizer)
+            self.scaler.update()
         else:
             self.optimizer.step()
 
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
index 083ad319305f3..4c966585d5f1f 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py
@@ -48,6 +48,7 @@ def train_batch(self, batch, model, optimizer, is_mp):
         scaled.backward()  # do backward
 
         scaler.step(optimizer)  # update parameters
+        scaler.update()
         optimizer.clear_grad()
         return scaled
 

From 787273eda531ae0cd651532eb39850a3614bfd67 Mon Sep 17 00:00:00 2001
From: piotrekobiIntel <piotr.paturej@intel.com>
Date: Fri, 24 Sep 2021 11:05:03 +0200
Subject: [PATCH 6/7] Added elementwise_sub_mkldnn operator (#35662)

* Add elementwise_sub_mkldnn_op without grad

* Add test to static_mode_white_list

* Refactor code, change license years

* Remove invalid grad implementation

* Fix element_wise_sub_op test

* Fix CI Approval error

* Remove unnecessary EltwiseSubMKLDNNGradKernel class

* Fix CI Approval 2

* Fix CI Approval 3

* Fix CI Approval Attempt #4

* Fix CI Approve Attempt #5

* Fix CI Approval Attempt #6

* Fix CI Approval Attempt #7

* Change test names containing add to sub

* Fix old tests testing add instead of sub

* Copy grad implementation from elementwise_add_mkldnn

* CI test fix attempt

* Revert "CI test fix attempt"

This reverts commit c647cacf41e6a87c715385a185de5cbf65fc8900.

* Fix CI attempt 2

* Fix elementwise_sub tests, temporary mkldnn broadcast test disable

* Add working implementation of elementwise_sub grad

* Fix build errors caused by pull

* Fix format error

* Fix format error 2

* Disable elementwise_sub_mkldnn test on GPU

* Apply fix for paddle.fluid import

* Revert changes of test_elementwise_sub and Fix mkldnn test

* Revert "Apply fix for paddle.fluid import"

This reverts commit fc3b122fec8e12f2bcb32928a2685ba4d20fd742.

* fix bug of module 'paddle' has no attribute 'fluid' for python3.6 (#35862)

* Add changes suggested by reviewers

* Change @unittest.skipIf... to @OpTestTool.skip_if_not_cpu_bf16() to satisfy Approval CI

* Remove check_dygraph=False to satisfy CI Approval

Co-authored-by: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
---
 .../mkldnn/elementwise_sub_mkldnn_op.cc       | 132 ++++++++++
 paddle/fluid/platform/mkldnn_reuse.h          |  16 +-
 .../mkldnn/test_elementwise_sub_mkldnn_op.py  | 236 ++++++++++++++++++
 tools/static_mode_white_list.py               |   1 +
 4 files changed, 380 insertions(+), 5 deletions(-)
 create mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py

diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
new file mode 100644
index 0000000000000..be8dad62c3c05
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
@@ -0,0 +1,132 @@
+
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+}  // namespace framework
+namespace platform {
+class CPUDeviceContext;
+struct CPUPlace;
+}  // namespace platform
+}  // namespace paddle
+
+namespace paddle {
+namespace operators {
+template <typename T>
+class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+    using Tensor = framework::Tensor;
+    // Gradient of Out = X - Y: dX = dOut, dY = -dOut (summed if broadcast).
+    auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto tz = framework::vectorize<int64_t>(dout->dims());
+    memory::data_type dout_type = framework::ToMKLDNNDataType(dout->type());
+    platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type,
+                                           onednn_engine);
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    auto reorder_src_memory_p = handler.AcquireSrcMemory(
+        dout->format(), platform::to_void_cast(dout->data<T>()));
+
+    if (dx) {  // dX = dOut: plain reorder (copy) of dout into dx
+      auto reorder_dst_memory_p =
+          handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
+      auto reorder_p =
+          handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+      platform::RecordEvent record_reorder("int_reorder",
+                                           platform::EventRole::kUniqueOp);
+
+      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+      astream.wait();
+
+      dx->set_layout(DataLayout::kMKLDNN);
+      dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
+    }
+
+    if (dy) {
+      // Same shape: dY = -dOut, done as a reorder with output scale -1
+      if (dout->dims() == dy->dims()) {
+        auto reorder_dst_memory_p =
+            handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
+
+        dnnl::primitive_attr reorder_attr;
+        std::vector<float> scales = {-1};
+        reorder_attr.set_output_scales(0, scales);
+        auto reorder_p = std::make_shared<dnnl::reorder>(
+            *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr);
+        platform::RecordEvent record_reorder("int_reorder",
+                                             platform::EventRole::kUniqueOp);
+        reorder_p->execute(astream, *reorder_src_memory_p,
+                           *reorder_dst_memory_p);
+        astream.wait();
+
+        dy->set_layout(DataLayout::kMKLDNN);
+        dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p));
+      } else {
+        // Broadcasting: dY = -reduce_sum(dOut); negation is a linear post-op
+
+        dnnl::post_ops po;
+        po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0);
+        dnnl::primitive_attr attr;
+        attr.set_post_ops(po);
+
+        platform::ReductionMKLDNNHandler<T> handler_sum(
+            dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine,
+            ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr);
+
+        auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
+        auto reduction_p = handler_sum.AcquireForwardPrimitive();
+
+        reduction_p->execute(astream, {
+                                          {DNNL_ARG_SRC, *reorder_src_memory_p},
+                                          {DNNL_ARG_DST, *dy_memory_p},
+                                      });
+        astream.wait();
+
+        dy->set_layout(DataLayout::kMKLDNN);
+        dy->set_format(
+            platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape(
+                paddle::framework::vectorize<int64_t>(dy->dims()))));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(
+    elementwise_sub, MKLDNN, paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_sub>,
+    ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
+                             dnnl::algorithm::binary_sub>,
+    ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_sub>,
+    ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_sub>)
+
+REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseSubMKLDNNGradKernel<paddle::platform::bfloat16>,
+                   ops::EltwiseSubMKLDNNGradKernel<float>)
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index d6ab9e50a066e..1aa8c0cdb57f9 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/operator.h"
@@ -927,7 +928,6 @@ class BroadcastDataMKLDNNHandler
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr = output->mutable_data<T_out>(
         this->place_, this->fwd_pd_->dst_desc().get_size());
-    ;
     memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
     return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr);
   }
@@ -940,7 +940,8 @@ class ReductionMKLDNNHandler
   ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p,
                          const float eps, const mkldnn::engine engine,
                          platform::Place cpu_place, const Tensor* x,
-                         const Tensor* y, std::vector<int64_t> y_tz)
+                         const Tensor* y, std::vector<int64_t> y_tz,
+                         const dnnl::primitive_attr& attr = NULL)
       : platform::MKLDNNHandlerNoCachingT<T, dnnl::reduction>(engine,
                                                               cpu_place) {
     PADDLE_ENFORCE_EQ(
@@ -957,7 +958,10 @@ class ReductionMKLDNNHandler
     const auto y_md =
         memory::desc(y_tz, platform::MKLDNNGetDataType<T>(), x->format());
 
-    this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps);
+    if (attr)
+      this->AcquireForwardPrimitiveDescriptor(attr, algo, x_md, y_md, p, eps);
+    else
+      this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps);
   }
 };
 
@@ -979,8 +983,9 @@ class ActivationMKLDNNHandler
     if (ctx.Type() == "scale") {
       bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
       auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
-      alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
-                                        : (float)*(scale_tensor->data<T>());
+      alpha = (scale_tensor == nullptr)
+                  ? ctx.Attr<float>("scale")
+                  : static_cast<float>(*(scale_tensor->data<T>()));
       beta = ctx.Attr<float>("bias");
       // if bias_after_scale == true
       //   out = scale*X + bias
@@ -1504,6 +1509,7 @@ static void SetDstMemoryQuantized(
   T* output_data = output->mutable_data<T>(ctx.GetPlace());
   const size_t dst_dims = dst_tz.size();
   MKLDNNMemoryFormat dst_fmt;
+
   PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument(
                                      "Dst memory for quantization can not have "
                                      "dims > 5. But received dst_dims is %d.",
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py
new file mode 100644
index 0000000000000..62c8c9571b793
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py
@@ -0,0 +1,236 @@
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from paddle import enable_static
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
+from paddle.fluid.framework import _current_expected_place
+import paddle.fluid.core as core
+
+
+@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)),
+                    "GPU is not supported")
+class TestMKLDNNElementwiseSubOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
+
+    def init_axis(self):
+        self.axis = -1
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMKLDNNElementwiseSubOp2(TestMKLDNNElementwiseSubOp):
+    def init_input_output(self):
+        self.x = np.random.random((100, )).astype(self.dtype)
+        self.y = np.random.random((100, )).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+
+class TestMKLDNNElementwiseSubOp3(TestMKLDNNElementwiseSubOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+
+class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+
+class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+
+class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
+        self.y = np.random.rand(10, 12).astype(self.dtype)
+        self.out = self.x - self.y.reshape(1, 10, 12, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp):
+    def init_input_output(self):
+        self.x = np.random.rand(10, 12).astype(self.dtype)
+        self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype)
+        self.out = self.x - self.y
+
+    def init_axis(self):
+        self.axis = 2
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ignore_y(self):
+        pass
+
+    def test_check_grad_ignore_x(self):
+        pass
+
+
+@OpTestTool.skip_if_not_cpu_bf16()
+class TestBf16(TestMKLDNNElementwiseSubOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.x_bf16 = convert_float_to_uint16(self.x)
+        self.y_bf16 = convert_float_to_uint16(self.y)
+        self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': convert_float_to_uint16(self.out)}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+        self.mkldnn_data_type = "bfloat16"
+
+    def init_input_output(self):
+        self.x = np.random.random(100, ).astype(self.dtype)
+        self.y = np.random.random(100, ).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X", "Y"],
+            "Out",
+            user_defined_grads=[self.x, -self.x],
+            user_defined_grad_outputs=[self.x_bf16])
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["Y"],
+            "Out",
+            user_defined_grads=[-self.y],
+            user_defined_grad_outputs=[self.y_bf16])
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X"],
+            "Out",
+            user_defined_grads=[self.x],
+            user_defined_grad_outputs=[self.x_bf16])
+
+
+class TestBf16Broadcasting(TestBf16):
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+
+    def compute_reduced_gradients(self, out_grads):
+        part_sum = np.add.reduceat(out_grads, [0], axis=0)
+        part_sum = np.add.reduceat(part_sum, [0], axis=1)
+        part_sum = np.add.reduceat(part_sum, [0], axis=2)
+        return -part_sum.flatten()
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X", "Y"],
+            "Out",
+            user_defined_grads=[
+                self.x, self.compute_reduced_gradients(self.x)
+            ],
+            user_defined_grad_outputs=[self.x_bf16])
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad_with_place(
+            core.CPUPlace(), ["Y"],
+            "Out",
+            user_defined_grads=[self.compute_reduced_gradients(self.x)],
+            user_defined_grad_outputs=[self.x_bf16])
+
+
+class TestInt8(TestMKLDNNElementwiseSubOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self._cpu_only = True
+
+    def init_dtype(self):
+        self.dtype = np.int8
+
+    def init_input_output(self):
+        self.x = np.random.randint(0, 3, (12, 9)).astype("int8")
+        self.y = np.random.randint(0, 3, (12, 9)).astype("int8")
+        self.out = np.subtract(self.x, self.y)
+
+    def init_scales(self):
+        self.attrs['Scale_x'] = 1.0
+        self.attrs['Scale_y'] = 1.0
+        self.attrs['Scale_out'] = 1.0
+
+    def test_check_output(self):
+        self.init_scales()
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ignore_x(self):
+        pass
+
+    def test_check_grad_ignore_y(self):
+        pass
+
+
+if __name__ == '__main__':
+    enable_static()
+    unittest.main()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 43281d4375ed0..7d0a2a8953fc8 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -610,6 +610,7 @@
     'test_dequantize_mkldnn_op',
     'test_elementwise_add_mkldnn_op',
     'test_elementwise_add_bf16_mkldnn_op',
+    'test_elementwise_sub_mkldnn_op',
     'test_elementwise_mul_mkldnn_op',
     'test_elementwise_mul_bf16_mkldnn_op',
     'test_fc_mkldnn_op',

From b91e8eec405ea6440124149cf6d21d559c7fded1 Mon Sep 17 00:00:00 2001
From: jiangcheng <thisjiang@qq.com>
Date: Fri, 24 Sep 2021 17:11:00 +0800
Subject: [PATCH 7/7] add gradient kernel of det op and slogdet op (#36013)

* add gradient kernel of det op and slogdet op

* fix CI APPROVAL problem
---
 paddle/fluid/operators/determinant_op.cc      |  11 +-
 paddle/fluid/operators/determinant_op.cu      |  36 ---
 paddle/fluid/operators/determinant_op.h       | 262 ++++++++++++++++--
 .../tests/unittests/test_determinant_op.py    |  32 +--
 4 files changed, 266 insertions(+), 75 deletions(-)

diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc
index 379a401cde62e..98247fbc862bb 100644
--- a/paddle/fluid/operators/determinant_op.cc
+++ b/paddle/fluid/operators/determinant_op.cc
@@ -48,6 +48,8 @@ class DeterminantGradOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input",
                    "DeterminantGradOp");
     OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "DeterminantGradOp");
     OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output",
                    framework::GradVarName("Input"), "DeterminantGradOp");
 
@@ -117,7 +119,8 @@ class SlogDeterminantGradOp : public framework::OperatorWithKernel {
                    "SlogDeterminantGradOp");
     OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out",
                    "SlogDeterminantGradOp");
-
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "SlogDeterminantGradOp");
     OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output",
                    framework::GradVarName("Input"), "SlogDeterminantGradOp");
 
@@ -179,7 +182,7 @@ REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp,
                   ops::SlogDeterminantGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(slogdeterminant_grad,
-                  ops::DeterminantGradOp)  // reuse det grad op
+                  ops::SlogDeterminantGradOp)  // dedicated slogdet grad op
 
 REGISTER_OP_CPU_KERNEL(
     slogdeterminant, ops::SlogDeterminantKernel<plat::CPUDeviceContext, float>,
@@ -187,5 +190,5 @@ REGISTER_OP_CPU_KERNEL(
 
 REGISTER_OP_CPU_KERNEL(
     slogdeterminant_grad,
-    ops::DeterminantGradKernel<plat::CPUDeviceContext, float>,
-    ops::DeterminantGradKernel<plat::CPUDeviceContext, double>);
+    ops::SlogDeterminantGradKernel<plat::CPUDeviceContext, float>,
+    ops::SlogDeterminantGradKernel<plat::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu
index f17d94d805228..d19d4c3d09386 100644
--- a/paddle/fluid/operators/determinant_op.cu
+++ b/paddle/fluid/operators/determinant_op.cu
@@ -14,42 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/determinant_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-using Tensor = framework::Tensor;
-
-template <typename T>
-__global__ void DeterminantGrad(const size_t numel, T* out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < numel) {
-    out[tid] = static_cast<T>(1);
-  }
-}
-
-template <typename T>
-class DeterminantGradCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    const T* dout_data = dout->data<T>();
-    auto dout_dim = vectorize(dout->dims());
-
-    auto* dx = context.Output<Tensor>(framework::GradVarName("Input"));
-    T* dx_data = dx->mutable_data<T>(context.GetPlace());
-
-    int64_t numel = dx->numel();
-    for (int64_t idx = 0; idx < numel; idx++) {
-      dx_data[idx] = static_cast<T>(1);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h
index ead1262d9fe06..4c17869fb5d2a 100644
--- a/paddle/fluid/operators/determinant_op.h
+++ b/paddle/fluid/operators/determinant_op.h
@@ -19,7 +19,11 @@
 #include <cmath>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/complex_functors.h"
+#include "paddle/fluid/operators/math/matrix_inverse.h"
+#include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -48,11 +52,10 @@ class EigenMatrix<double> {
 inline int64_t GetBatchCount(const framework::DDim dims) {
   int64_t batch_count = 1;
   auto dim_size = dims.size();
-  PADDLE_ENFORCE_GT(dim_size, 2,
-                    platform::errors::InvalidArgument(
-                        "To get the number of batch square matrices, "
-                        "the size of dimension should greater than 2.",
-                        dim_size));
+  PADDLE_ENFORCE_GE(
+      dim_size, 2,
+      platform::errors::InvalidArgument(
+          "the input matrix dimension size should greater than 2."));
 
   // Cumulative multiplying each dimension until the last 2 to get the batch
   // count,
@@ -77,7 +80,7 @@ struct DeterminantFunctor {
       auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
       std::vector<T> sub_vec(begin_iter,
                              end_iter);  // get every square matrix data
-      Eigen::MatrixXf matrix(rank, rank);
+      typename EigenMatrix<T>::MatrixType matrix(rank, rank);
       for (int64_t i = 0; i < rank; ++i) {
         for (int64_t j = 0; j < rank; ++j) {
           matrix(i, j) = sub_vec[rank * i + j];
@@ -109,41 +112,169 @@ class DeterminantKernel : public framework::OpKernel<T> {
                           "the input matrix should be square matrix."));
     auto rank = input_dim[input_dim_size - 1];  // square matrix length
     DeterminantFunctor<T>()(*input, context, rank, batch_count, output);
+    auto output_dims =
+        framework::slice_ddim(input->dims(), 0, input_dim_size - 2);
     if (input_dim_size > 2) {
-      auto output_dims =
-          framework::slice_ddim(input->dims(), 0, input_dim_size - 2);
       output->Resize(output_dims);
+    } else {
+      // when input is a two-dimension matrix, The det value is a number.
+      output->Resize({1});
     }
     VLOG(2) << "output dim:" << output->dims();
   }
 };
 
+template <typename T>
+struct FoundZeroFunctor {  // per-index check: is x_[idx] equal to zero?
+  FoundZeroFunctor(const T* x, int64_t numel, bool* res)
+      : x_(x), numel_(numel), res_(res) {}
+  HOSTDEVICE void operator()(size_t idx) const {
+    if (*res_ || idx >= static_cast<size_t>(numel_)) {
+      // a zero was already recorded, or idx is out of range: nothing to do
+      return;
+    }
+    *res_ = (x_[idx] == static_cast<T>(0));
+  }
+  const T* x_;  // values to scan
+  int64_t numel_;  // number of valid entries in x_
+  bool* res_;  // result flag; read before written, so caller must clear it
+};
+
+template <typename DeviceContext, typename T>  // false if det holds a zero
+inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx,
+                                  const framework::Tensor* det) {
+  auto& dev_ctx = ctx.template device_context<DeviceContext>();
+  auto numel = det->numel();
+
+  framework::Tensor dev_tensor;
+  auto* data = dev_tensor.mutable_data<bool>({1}, ctx.GetPlace());
+
+  // one-element flag tensor on the op's place, cleared to false
+  math::SetConstant<DeviceContext, bool> zero;
+  zero(dev_ctx, &dev_tensor, false);
+
+  // run FoundZeroFunctor over every index of det to raise the flag on a zero
+  platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
+  FoundZeroFunctor<T> functor(det->data<T>(), numel, data);
+  for_range(functor);
+
+  // wait for the device work, then copy the flag to the CPU to read it
+  dev_ctx.Wait();
+  framework::Tensor cpu_tensor;
+  framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor);
+
+  // if a zero was found, the matrix is treated as not invertible;
+  // otherwise it is treated as invertible
+  auto* res = cpu_tensor.data<bool>();
+  return !(*res);
+}
+
 template <typename DeviceContext, typename T>
 class DeterminantGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Not support DeterminantGrad at this time."));
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    const auto* input = context.Input<framework::Tensor>("Input");
+    const auto* det = context.Input<framework::Tensor>("Out");
+    const auto* grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* ddet =
+        context.Output<framework::Tensor>(framework::GradVarName("Input"));
+    // Writes d(Input) = (dOut * det) * inverse(Input)^T, or zeros if det == 0.
+    auto input_dims_size = input->dims().size();
+    if (input_dims_size > 2) {
+      PADDLE_ENFORCE_EQ(
+          grad->dims().size() + 2, input_dims_size,
+          platform::errors::InvalidArgument(
+              "The grad tensor of det dims size should 2 less than"
+              " input tensor's, but here differ %d",
+              input_dims_size - grad->dims().size()));
+    } else if (input_dims_size == 2) {
+      // a 2-D input yields a det of shape {1}, so grad must be 1-D here
+      PADDLE_ENFORCE_EQ(
+          grad->dims().size(), 1,
+          platform::errors::InvalidArgument(
+              "The grad tensor of det dims size should be 1 when the input"
+              " tensor is a 2-D matrix, but received %d",
+              grad->dims().size()));
+    } else {
+      // checked in forward, pass
+    }
+
+    // Check whether the matrix is invertible
+    // (matrix A not invertible) == (det(A)=0)
+    if (!CheckMatrixInvertible<DeviceContext, T>(context, det)) {
+      // Not invertible: the gradient is filled with zeros
+      VLOG(3) << "The input matrix not invertible!";
+      ddet->Resize(input->dims());
+      ddet->mutable_data<T>(context.GetPlace());
+      math::SetConstant<DeviceContext, T> zero;
+      zero(dev_ctx, ddet, static_cast<T>(0.0f));
+      return;
+    }
+
+    // The matrix is invertible
+    // let |A| = Determinant(A)
+    // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
+    // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2,
+    // -1)
+
+    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(context);
+
+    // First: inverse(A)
+    framework::Tensor inverse_A;
+    // A must be square matrices!
+    inverse_A.Resize(input->dims());
+    inverse_A.mutable_data<T>(context.GetPlace());
+
+    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
+    mat_inv(dev_ctx, *input, &inverse_A);
+
+    VLOG(3) << "inverse(A) dims: " << inverse_A.dims();
+
+    // Second: inverse(A).transpose(-2, -1)
+    framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A);
+    VLOG(3) << "inverse(A).transpose(-2, -1) dims: "
+            << transpose_inverse_A.dims();
+
+    // Third: dA * |A|
+    auto mul_dA_detA = helper.Mul(*grad, *det);
+    VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims();
+
+    // Fourth: unsqueeze(dA * |A|, [-1, -2])
+    auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1);
+    auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2);
+    VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims();
+
+    // Finally: unsqueeze(dA * |A|) * inverse(A).transpose(-2, -1)
+    auto res = helper.Mul(unsqueeze2, transpose_inverse_A);
+
+    VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims();
+
+    framework::TensorCopy(res, context.GetPlace(), ddet);
+
+    ddet->Resize(input->dims());
+    VLOG(3) << "d|A| dims: " << ddet->dims();
   }
 };
 
 template <typename T>
 struct SlogDeterminantFunctor {
   void operator()(const Tensor& input, const framework::ExecutionContext ctx,
-                  int rank, int batch_count, Tensor* output) {
+                  int64_t rank, int64_t batch_count, Tensor* output) {
     std::vector<T> input_vec;
     std::vector<T> sign_vec;
     std::vector<T> log_vec;
     std::vector<T> output_vec;
     framework::TensorToVector(input, ctx.device_context(), &input_vec);
-    for (int i = 0; i < batch_count; ++i) {  // maybe can be parallel
+    for (int64_t i = 0; i < batch_count; ++i) {  // maybe can be parallel
       auto begin_iter = input_vec.begin() + i * rank * rank;
       auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
       std::vector<T> sub_vec(begin_iter,
                              end_iter);  // get every square matrix data
       typename EigenMatrix<T>::MatrixType matrix(rank, rank);
-      for (int i = 0; i < rank; ++i) {
-        for (int j = 0; j < rank; ++j) {
+      for (int64_t i = 0; i < rank; ++i) {
+        for (int64_t j = 0; j < rank; ++j) {
           matrix(i, j) = sub_vec[rank * i + j];
         }
       }
@@ -185,6 +316,10 @@ class SlogDeterminantKernel : public framework::OpKernel<T> {
     auto rank = input_dim[input_dim_size - 1];  // square matrix length
     SlogDeterminantFunctor<T>()(*input, context, rank, batch_count, output);
     std::vector<int> output_dim_vec(input_dim.begin(), input_dim.end() - 2);
+    if (input_dim.size() == static_cast<size_t>(2)) {
+      // when input is a two-dimension matrix, The det value is a number.
+      output_dim_vec = {1};
+    }
     output_dim_vec.insert(output_dim_vec.begin(),
                           2);  // make the output dims as same as numpy
     auto output_dims = framework::make_ddim(output_dim_vec);
@@ -197,8 +332,103 @@ template <typename DeviceContext, typename T>
 class SlogDeterminantGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Not support SlogDeterminantGrad at this time."));
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    const auto* input = context.Input<framework::Tensor>("Input");
+    const auto* slogdet = context.Input<framework::Tensor>("Out");
+    const auto* grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dslogdet =
+        context.Output<framework::Tensor>(framework::GradVarName("Input"));
+
+    PADDLE_ENFORCE_EQ(grad->dims()[0], 2,
+                      platform::errors::InvalidArgument(
+                          "The grad tensor of SlogDet should contain two"
+                          " grad: sign and absslogdet, but here %ld.",
+                          grad->dims()[0]));
+    if (input->dims().size() > 2) {
+      PADDLE_ENFORCE_EQ(
+          grad->dims().size() + 1, input->dims().size(),
+          platform::errors::InvalidArgument(
+              "The grad tensor of slogdet dims size should 1 less than"
+              " input tensor's, but here differ %d",
+              input->dims().size() - grad->dims().size()));
+    }
+
+    // Check whether A is invertible: (A not invertible) == (absslogdet(A)=0).
+    // NOTE(review): slice 0 may be sign, not absslogdet - confirm.
+    auto slogdet_vec = slogdet->Split(1, 0);
+    auto absslogdet_val = slogdet_vec[0];
+    if (!CheckMatrixInvertible<DeviceContext, T>(context, &absslogdet_val)) {
+      // The matrix is not invertible: fill the input gradient with NaN.
+      VLOG(3) << "The input matrix not invertible!";
+      dslogdet->Resize(input->dims());
+      dslogdet->mutable_data<T>(context.GetPlace());
+      math::SetConstant<DeviceContext, T> zero;
+      zero(dev_ctx, dslogdet, std::numeric_limits<T>::quiet_NaN());
+      return;
+    }
+
+    // The matrix is invertible.
+    // Let sl|A| = SlogDeterminant(A).
+    // Ref: https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf
+    // We set dsl|A| = unsqueeze(dslA, [-1, -2]) *
+    // inverse(A).conj().transpose(-2, -1)
+
+    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(context);
+
+    // First: inverse(A)
+    framework::Tensor inverse_A;
+    // A must be square, so inverse(A) gets the same dims as the input.
+    inverse_A.Resize(input->dims());
+    inverse_A.mutable_data<T>(context.GetPlace());
+
+    math::MatrixInverseFunctor<DeviceContext, T> mat_inv;
+    mat_inv(dev_ctx, *input, &inverse_A);
+
+    VLOG(3) << "inverse(A) dims: " << inverse_A.dims();
+
+    // Second: inverse(A).conj()
+    framework::Tensor conj_inverse_A;
+    conj_inverse_A.Resize(inverse_A.dims());
+    auto numel = input->numel();
+    auto* conj_data = conj_inverse_A.mutable_data<T>(context.GetPlace(),
+                                                     size_t(numel * sizeof(T)));
+
+    platform::ForRange<DeviceContext> for_range(dev_ctx, numel);
+    math::ConjFunctor<T> functor(inverse_A.data<T>(), numel, conj_data);
+    for_range(functor);
+
+    VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims();
+
+    // Third: inverse(A).conj().transpose(-2, -1)
+    framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A);
+    VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: "
+            << transpose_inverse_A.dims();
+
+    // Fourth: split the grad value into [sign_grad, absslogdet_grad]
+    auto grad_vec = grad->Split(1, 0);
+    auto det_grad = grad_vec[1];
+
+    // Remove the useless first dimension left over from the split.
+    int det_grad_size = det_grad.dims().size();
+    std::vector<int> det_grad_vec;
+    for (int i = 1; i < det_grad_size; ++i) {
+      det_grad_vec.emplace_back(det_grad.dims()[i]);
+    }
+    det_grad.Resize(det_grad.dims().reshape(det_grad_vec));
+
+    // Fifth: unsqueeze(dslA, [-1, -2])
+    auto unsqueeze1 = helper.Unsqueeze(det_grad, -1);
+    auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2);
+    VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims();
+
+    // Finally: unsqueeze(dslA) * inverse(A).conj().transpose(-2, -1)
+    auto res = helper.Mul(unsqueeze2, transpose_inverse_A);
+    VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims();
+
+    framework::TensorCopy(res, context.GetPlace(), dslogdet);
+    dslogdet->Resize(input->dims());
+    VLOG(3) << "dsl|A| dims: " << dslogdet->dims();
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_determinant_op.py b/python/paddle/fluid/tests/unittests/test_determinant_op.py
index c19d44eb030cf..f8110bffa2f71 100644
--- a/python/paddle/fluid/tests/unittests/test_determinant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_determinant_op.py
@@ -16,7 +16,7 @@
 
 import unittest
 import numpy as np
-from op_test import OpTest, skip_check_grad_ci
+from op_test import OpTest
 import paddle
 import paddle.nn.functional as F
 import paddle.fluid as fluid
@@ -26,7 +26,6 @@
 paddle.enable_static()
 
 
-@skip_check_grad_ci(reason="determinant grad is in progress.")
 class TestDeterminantOp(OpTest):
     def setUp(self):
         self.init_data()
@@ -37,11 +36,11 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        pass
+        self.check_grad(['Input'], ['Out'])
 
     def init_data(self):
         np.random.seed(0)
-        self.case = np.random.rand(3, 3, 3, 3, 3).astype('float64')
+        self.case = np.random.rand(3, 3, 3, 5, 5).astype('float64')
         self.inputs = {'Input': self.case}
         self.target = np.linalg.det(self.case)
 
@@ -49,30 +48,25 @@ def init_data(self):
 class TestDeterminantOpCase1(TestDeterminantOp):
     def init_data(self):
         np.random.seed(0)
-        self.case = np.random.rand(3, 3, 3, 3).astype(np.float32)
+        self.case = np.random.rand(10, 10).astype('float32')
         self.inputs = {'Input': self.case}
         self.target = np.linalg.det(self.case)
 
-    def test_check_grad(self):
-        pass
-
 
 class TestDeterminantOpCase2(TestDeterminantOp):
     def init_data(self):
         np.random.seed(0)
-        self.case = np.random.rand(4, 2, 4, 4).astype('float64')
+        # non-invertible (singular) matrix: every entry is 1
+        self.case = np.ones([4, 2, 4, 4]).astype('float64')
         self.inputs = {'Input': self.case}
         self.target = np.linalg.det(self.case)
 
-    def test_check_grad(self):
-        pass
-
 
 class TestDeterminantAPI(unittest.TestCase):
     def setUp(self):
-        self.shape = [3, 3, 3, 3]
         np.random.seed(0)
-        self.x = np.random.rand(3, 3, 3, 3).astype(np.float32)
+        self.shape = [3, 3, 5, 5]
+        self.x = np.random.random(self.shape).astype(np.float32)
         self.place = paddle.CPUPlace()
 
     def test_api_static(self):
@@ -96,7 +90,6 @@ def test_api_dygraph(self):
         paddle.enable_static()
 
 
-@skip_check_grad_ci(reason="slogdeterminant grad is in progress.")
 class TestSlogDeterminantOp(OpTest):
     def setUp(self):
         self.op_type = "slogdeterminant"
@@ -107,11 +100,12 @@ def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        pass
+        # slogdet's gradient values are large, so use a looser relative tolerance
+        self.check_grad(['Input'], ['Out'], max_relative_error=0.1)
 
     def init_data(self):
         np.random.seed(0)
-        self.case = np.random.rand(3, 3, 3, 3).astype('float64')
+        self.case = np.random.rand(4, 5, 5).astype('float64')
         self.inputs = {'Input': self.case}
         self.target = np.array(np.linalg.slogdet(self.case))
 
@@ -126,9 +120,9 @@ def init_data(self):
 
 class TestSlogDeterminantAPI(unittest.TestCase):
     def setUp(self):
-        self.shape = [3, 3, 3, 3]
         np.random.seed(0)
-        self.x = np.random.rand(3, 3, 3, 3).astype(np.float32)
+        self.shape = [3, 3, 5, 5]
+        self.x = np.random.random(self.shape).astype(np.float32)
         self.place = paddle.CPUPlace()
 
     def test_api_static(self):