From 9eaf02981832ee771ad821d1eaae3702edccd6cf Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Tue, 28 Feb 2023 18:19:53 +0800
Subject: [PATCH 01/17] add TensorRT converter for temporal_shift

---
 .../fluid/inference/api/analysis_predictor.cc |  1 +
 .../inference/tensorrt/convert/CMakeLists.txt |  3 +-
 .../tensorrt/convert/temporal_shift_op.cc     | 89 +++++++++++++++++++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  6 +-
 4 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index e8888940a99ac9..437808696a8871 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2490,6 +2490,7 @@ USE_TRT_CONVERTER(mish);
 USE_TRT_CONVERTER(deformable_conv);
 USE_TRT_CONVERTER(pool3d)
 USE_TRT_CONVERTER(square);
+USE_TRT_CONVERTER(temporal_shift);
 // unary op
 USE_TRT_CONVERTER(exp);
 USE_TRT_CONVERTER(log);
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index de91a0493b6946..271563380f3746 100755
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -100,7 +100,8 @@ list(
   elementwiseadd_transpose_op.cc
   skip_groupnorm_act_op.cc
   preln_groupnorm_act_op.cc
-  expand_v2_op.cc)
+  expand_v2_op.cc
+  temporal_shift_op.cc)
 
 if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
   list(APPEND CONVERT_FILES emb_eltwise_layernorm.cc
diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
new file mode 100644
index 00000000000000..184136eccb512c
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * TemporalShiftOp.
+ */
+class TemporalShiftOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    const float shift_ratio = PADDLE_GET_CONST(float, op_desc.GetAttr("shift_ratio"));
+    const int T = PADDLE_GET_CONST(int, op_desc.GetAttr("seg_num"));
+
+    const auto& input_dims = input->getDimensions();
+    int NT = input_dims.d[0];
+    int C = input_dims.d[1];
+    int H = input_dims.d[2];
+    int W = input_dims.d[3];
+    int N = NT / T;
+
+    // Reshape input to [N,T,C,H,W]
+    auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+    reshape_layer->setReshapeDimensions(nvinfer1::Dims5(N, T, C, H, W));
+    input = reshape_layer->getOutput(0);
+
+    // Pad input
+    auto* pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input, nvinfer1::Dims4(0, 0, 1, 1), nvinfer1::Dims4(0, 0, 1, 1));
+    input = pad_layer->getOutput(0);
+
+    // Slice input
+    int slice_size = static_cast<int>(C * shift_ratio);
+    auto slice1_layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, nvinfer1::Dims3(0, 0, 0), nvinfer1::Dims3(N, T, slice_size), nvinfer1::Dims3(1, 1, 1));
+    auto slice2_layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, nvinfer1::Dims3(0, 2, slice_size), nvinfer1::Dims3(N, T, slice_size), nvinfer1::Dims3(1, 1, 1));
+    auto slice3_layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, nvinfer1::Dims3(0, 1, slice_size * 2), nvinfer1::Dims3(N, T, C - slice_size * 2), nvinfer1::Dims3(1, 1, 1));
+
+    // Concatenate slices along the third dimension (C)
+    auto concat_layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, &slice1_layer->getOutput(0), 3);
+    concat_layer->setInput(1, slice2_layer->getOutput(0));
+    concat_layer->setInput(2, slice3_layer->getOutput(0));
+    concat_layer->setAxis(2);
+
+    // Reshape output to [N*T,C,H,W]
+    auto reshape_layer2 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
+    reshape_layer2->setReshapeDimensions(nvinfer1::Dims4(NT, C, H, W));
+
+    // Set output
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(reshape_layer2, "temporal_shift", {output_name}, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(temporal_shift, TemporalShiftOp);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 029665bd111315..cf67b97074a475 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2701,7 +2701,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "expand_v2",
       "fuse_eleadd_transpose",
       "skip_groupnorm_act",
-      "preln_groupnorm_act"};
+      "preln_groupnorm_act",
+      "temporal_shift"};
 
   std::unordered_set<std::string> teller_set{
       "mul",
@@ -2853,7 +2854,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "expand_v2",
       "fuse_eleadd_transpose",
       "skip_groupnorm_act",
-      "preln_groupnorm_act"};
+      "preln_groupnorm_act",
+      "temporal_shift"};
 };
 
 struct GenericPluginTeller : public Teller {

From 62528288509705d61ad9f217e61cd278aef0e5af Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Tue, 28 Feb 2023 21:33:00 +0800
Subject: [PATCH 02/17] rework temporal_shift TensorRT converter and add unit test

---
 .../tensorrt/convert/temporal_shift_op.cc     |  50 ++++++--
 .../test_trt_convert_temporal_shift.py        | 117 ++++++++++++++++++
 2 files changed, 155 insertions(+), 12 deletions(-)
 create mode 100755 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index 184136eccb512c..562c7d4d6acca7 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -53,32 +53,58 @@ class TemporalShiftOpConverter : public OpConverter {
 
     // Reshape input to [N,T,C,H,W]
     auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    reshape_layer->setReshapeDimensions(nvinfer1::Dims5(N, T, C, H, W));
+    nvinfer1::Dims reshape_dims{5, {N, T, C, H, W}};
+    reshape_layer->setReshapeDimensions(reshape_dims);
     input = reshape_layer->getOutput(0);
 
     // Pad input
-    auto* pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input, nvinfer1::Dims4(0, 0, 1, 1), nvinfer1::Dims4(0, 0, 1, 1));
+    auto* pad_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                           PaddingNd,
+                                           *input,
+                                           nvinfer1::Dims4(0, 1, 0, 0),
+                                           nvinfer1::Dims4(0, 1, 0, 0));
     input = pad_layer->getOutput(0);
 
     // Slice input
-    int slice_size = static_cast<int>(C * shift_ratio);
-    auto slice1_layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, nvinfer1::Dims3(0, 0, 0), nvinfer1::Dims3(N, T, slice_size), nvinfer1::Dims3(1, 1, 1));
-    auto slice2_layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, nvinfer1::Dims3(0, 2, slice_size), nvinfer1::Dims3(N, T, slice_size), nvinfer1::Dims3(1, 1, 1));
-    auto slice3_layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, nvinfer1::Dims3(0, 1, slice_size * 2), nvinfer1::Dims3(N, T, C - slice_size * 2), nvinfer1::Dims3(1, 1, 1));
+    int slice_c = int(C * shift_ratio);
+    int slice_c2 = int(C * shift_ratio * 2);
+    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Slice,
+                                              *pad_layer->getOutput(0),
+                                              nvinfer1::Dims3{0, 0, 0},
+                                              nvinfer1::Dims3{T, slice_c, H},
+                                              nvinfer1::Dims3{1, 1, 1});
+    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Slice,
+                                              *pad_layer->getOutput(0),
+                                              nvinfer1::Dims3{0, 2, 0},
+                                              nvinfer1::Dims3{T, slice_c2 - slice_c, H},
+                                              nvinfer1::Dims3{1, 1, 1});
+    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Slice,
+                                              *pad_layer->getOutput(0),
+                                              nvinfer1::Dims3{0, 1, 0},
+                                              nvinfer1::Dims3{T, C - slice_c2, H},
+                                              nvinfer1::Dims3{1, 1, 1});
 
     // Concatenate slices along the third dimension (C)
-    auto concat_layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, &slice1_layer->getOutput(0), 3);
-    concat_layer->setInput(1, slice2_layer->getOutput(0));
-    concat_layer->setInput(2, slice3_layer->getOutput(0));
+    nvinfer1::ITensor* concat_inputs[3] = {slice1_layer->getOutput(0),
+                                           slice2_layer->getOutput(0),
+                                           slice3_layer->getOutput(0)};
+    auto* concat_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Concatenation,
+                                              concat_inputs, 3);
     concat_layer->setAxis(2);
 
     // Reshape output to [N*T,C,H,W]
-    auto reshape_layer2 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
-    reshape_layer2->setReshapeDimensions(nvinfer1::Dims4(NT, C, H, W));
+    nvinfer1::Dims output_shape{4, {N * T, C, H, W}};
+    auto* reshape_layer2 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
+    reshape_layer2->setReshapeDimensions(output_shape);
 
     // Set output
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(reshape_layer2, "temporal_shift", {output_name}, test_mode);
+
   }
 };
 
@@ -86,4 +112,4 @@ class TemporalShiftOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_TRT_OP_CONVERTER(temporal_shift, TemporalShiftOp);
+REGISTER_TRT_OP_CONVERTER(temporal_shift, TemporalShiftOpConverter);
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
new file mode 100755
index 00000000000000..4a1132f203c31f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import partial
+from typing import Any, Dict, List
+
+import numpy as np
+from program_config import ProgramConfig, TensorConfig
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest
+
+import paddle.inference as paddle_infer
+
+
+class TrtConvertTemporalShiftTest(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+        def generate_input1(attrs: List[Dict[str, Any]]):
+            T = attrs[0]['seg_num']
+            NT = 3 * T
+            return np.random.random([NT, 4, 32, 32]).astype(np.float32)
+
+        for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50]:
+            for T in range(2, 5):
+                dics = [{"shift_ratio": shift_value, "seg_num": T}, {}]
+
+                ops_config = [
+                    {
+                        "op_type": "temporal_shift",
+                        "op_inputs": {"X": ["input_data"]},
+                        "op_outputs": {"Out": ["temporal_shift_output_data"]},
+                        "op_attrs": dics[0],
+                    }
+                ]
+                ops = self.generate_op_config(ops_config)
+
+                program_config = ProgramConfig(
+                    ops=ops,
+                    weights={},
+                    inputs={
+                        "input_data": TensorConfig(
+                            data_gen=partial(generate_input1, dics)
+                        )
+                    },
+                    outputs=["temporal_shift_output_data"],
+                )
+
+                yield program_config
+
+    def sample_predictor_configs(
+            self, program_config
+    ) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            T = attrs[0]['seg_num']
+            self.dynamic_shape.min_input_shape = {
+                "input_data": [1 * T, 3, 32, 32]
+            }
+            self.dynamic_shape.max_input_shape = {
+                "input_data": [3 * T, 3, 64, 64]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input_data": [1 * T, 3, 64, 64]
+            }
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            return 1, 2
+
+        attrs = [
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), (1e-3, 1e-3)
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True
+        ), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True
+        ), (1e-3, 1e-3)
+
+    def test(self):
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()

From 0fd93b456c2397c36c266b9052f3351d98ecc3ec Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Wed, 1 Mar 2023 17:03:55 +0800
Subject: [PATCH 03/17] fix padding in temporal_shift TensorRT converter

---
 .../tensorrt/convert/temporal_shift_op.cc     | 116 ++++++++++++------
 .../test_trt_convert_temporal_shift.py        |  70 +++++------
 2 files changed, 109 insertions(+), 77 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index 562c7d4d6acca7..b2562594965a99 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -37,73 +37,117 @@ class TemporalShiftOpConverter : public OpConverter {
                   const framework::Scope& scope,
                   bool test_mode) override {
     VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer";
-
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+
     const float shift_ratio = PADDLE_GET_CONST(float, op_desc.GetAttr("shift_ratio"));
     const int T = PADDLE_GET_CONST(int, op_desc.GetAttr("seg_num"));
 
-    const auto& input_dims = input->getDimensions();
-    int NT = input_dims.d[0];
-    int C = input_dims.d[1];
-    int H = input_dims.d[2];
-    int W = input_dims.d[3];
-    int N = NT / T;
+    auto input_dims = input->getDimensions();
 
-    // Reshape input to [N,T,C,H,W]
+    const int NT = input_dims.d[0];
+    const int C = input_dims.d[1];
+    const int H = input_dims.d[2];
+    const int W = input_dims.d[3];
+    const int N = NT / T;
+
+    // Reshape input to [N,C,H,W,T]
     auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    nvinfer1::Dims reshape_dims{5, {N, T, C, H, W}};
+    nvinfer1::Dims reshape_dims{5, {N, C, H, W, T}};
     reshape_layer->setReshapeDimensions(reshape_dims);
-    input = reshape_layer->getOutput(0);
 
     // Pad input
     auto* pad_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                           PaddingNd,
-                                           *input,
-                                           nvinfer1::Dims4(0, 1, 0, 0),
-                                           nvinfer1::Dims4(0, 1, 0, 0));
-    input = pad_layer->getOutput(0);
+                                           Padding,
+                                           *reshape_layer->getOutput(0),
+                                           nvinfer1::DimsHW{0, 1},
+                                           nvinfer1::DimsHW{0, 1});
+
+    // Reshape input to [N,T,C,H,W]
+    auto reshape_layer2 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *pad_layer->getOutput(0));
+    nvinfer1::Dims reshape_dims2{5, {N, T + 2, C, H, W}};
+    reshape_layer2->setReshapeDimensions(reshape_dims2);
+
+    // print pad_layer->getOutput(0)->getDimensions()
+    auto pad_dims = pad_layer->getOutput(0)->getDimensions();
+    int dims = pad_dims.nbDims;
+    for (int i = 0; i < dims; ++i) {
+        std::cout << pad_dims.d[i] << " ";
+    }
+    std::cout << std::endl;
 
     // Slice input
+//    int slice_c = int(C * shift_ratio);
+//    int slice_c2 = int(C * shift_ratio * 2);
+//
+//    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
+//                                              Slice,
+//                                              *pad_layer->getOutput(0),
+//                                              nvinfer1::Dims{5, {0, 0, 0, 0, 0}},
+//                                              nvinfer1::Dims{5, {N, slice_c, H, W, T}},
+//                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+//    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
+//                                              Slice,
+//                                              *pad_layer->getOutput(0),
+//                                              nvinfer1::Dims{5, {0, slice_c, 0, 0, 2}},
+//                                              nvinfer1::Dims{5, {N, slice_c, H, W, T}},
+//                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+//    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
+//                                              Slice,
+//                                              *pad_layer->getOutput(0),
+//                                              nvinfer1::Dims{5, {0, slice_c2, 0, 0, 1}},
+//                                              nvinfer1::Dims{5, {N, C - slice_c2, H, W, T}},
+//                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
     int slice_c = int(C * shift_ratio);
     int slice_c2 = int(C * shift_ratio * 2);
     auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
                                               Slice,
-                                              *pad_layer->getOutput(0),
-                                              nvinfer1::Dims3{0, 0, 0},
-                                              nvinfer1::Dims3{T, slice_c, H},
-                                              nvinfer1::Dims3{1, 1, 1});
+                                              *reshape_layer2->getOutput(0),
+                                              nvinfer1::Dims{5, {0, 0, 0, 0, 0}},
+                                              nvinfer1::Dims{5, {N, T, slice_c, H, W}},
+                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
     auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
                                               Slice,
-                                              *pad_layer->getOutput(0),
-                                              nvinfer1::Dims3{0, 2, 0},
-                                              nvinfer1::Dims3{T, slice_c2 - slice_c, H},
-                                              nvinfer1::Dims3{1, 1, 1});
+                                              *reshape_layer2->getOutput(0),
+                                              nvinfer1::Dims{5, {0, 2, slice_c, 0, 0}},
+                                              nvinfer1::Dims{5, {N, T, slice_c, H, W}},
+                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
     auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
                                               Slice,
-                                              *pad_layer->getOutput(0),
-                                              nvinfer1::Dims3{0, 1, 0},
-                                              nvinfer1::Dims3{T, C - slice_c2, H},
-                                              nvinfer1::Dims3{1, 1, 1});
+                                              *reshape_layer2->getOutput(0),
+                                              nvinfer1::Dims{5, {0, 1, slice_c2, 0, 0}},
+                                              nvinfer1::Dims{5, {N, T, C - slice_c2, H, W}},
+                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
 
     // Concatenate slices along the third dimension (C)
-    nvinfer1::ITensor* concat_inputs[3] = {slice1_layer->getOutput(0),
-                                           slice2_layer->getOutput(0),
-                                           slice3_layer->getOutput(0)};
-    auto* concat_layer = TRT_ENGINE_ADD_LAYER(engine_,
+    nvinfer1::IConcatenationLayer* concat_layer;
+    if(!slice_c){
+        nvinfer1::ITensor* concat_inputs[2] = {slice2_layer->getOutput(0),
+                                               slice3_layer->getOutput(0)};
+        concat_layer = TRT_ENGINE_ADD_LAYER(engine_,
                                               Concatenation,
-                                              concat_inputs, 3);
-    concat_layer->setAxis(2);
+                                              concat_inputs, 2);
+        concat_layer->setAxis(2);
+    }
+    else{
+        nvinfer1::ITensor* concat_inputs[3] = {slice1_layer->getOutput(0),
+                                               slice2_layer->getOutput(0),
+                                               slice3_layer->getOutput(0)};
+        concat_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                                  Concatenation,
+                                                  concat_inputs, 3);
+        concat_layer->setAxis(2);
+    }
 
     // Reshape output to [N*T,C,H,W]
     nvinfer1::Dims output_shape{4, {N * T, C, H, W}};
-    auto* reshape_layer2 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
-    reshape_layer2->setReshapeDimensions(output_shape);
+    auto* reshape_layer3 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
+    reshape_layer3->setReshapeDimensions(output_shape);
 
     // Set output
     auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(reshape_layer2, "temporal_shift", {output_name}, test_mode);
+    RreplenishLayerAndOutput(reshape_layer3, "temporal_shift", {output_name}, test_mode);
 
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 4a1132f203c31f..016c4bbed56ac6 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -14,7 +14,7 @@
 
 import unittest
 from functools import partial
-from typing import Any, Dict, List
+from typing import List
 
 import numpy as np
 from program_config import ProgramConfig, TensorConfig
@@ -28,10 +28,9 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
     def sample_program_configs(self):
-        def generate_input1(attrs: List[Dict[str, Any]]):
-            T = attrs[0]['seg_num']
-            NT = 3 * T
-            return np.random.random([NT, 4, 32, 32]).astype(np.float32)
+        def generate_input1(attrs):
+            T = attrs[0]["seg_num"]
+            return np.ones([3 * T, 10, 64, 64]).astype(np.float32)
 
         for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50]:
             for T in range(2, 5):
@@ -41,22 +40,23 @@ def generate_input1(attrs: List[Dict[str, Any]]):
                     {
                         "op_type": "temporal_shift",
                         "op_inputs": {"X": ["input_data"]},
-                        "op_outputs": {"Out": ["temporal_shift_output_data"]},
+                        "op_outputs": {"Out": ["output_data"]},
                         "op_attrs": dics[0],
                     }
                 ]
-                ops = self.generate_op_config(ops_config)
 
-                program_config = ProgramConfig(
-                    ops=ops,
-                    weights={},
-                    inputs={
-                        "input_data": TensorConfig(
-                            data_gen=partial(generate_input1, dics)
-                        )
-                    },
-                    outputs=["temporal_shift_output_data"],
-                )
+                ops = self.generate_op_config(ops_config)
+                for i in range(10):
+                    program_config = ProgramConfig(
+                        ops=ops,
+                        weights={},
+                        inputs={
+                            "input_data": TensorConfig(
+                                data_gen=partial(generate_input1, dics)
+                            ),
+                        },
+                        outputs=["output_data"],
+                    )
 
                 yield program_config
 
@@ -64,50 +64,38 @@ def sample_predictor_configs(
             self, program_config
     ) -> (paddle_infer.Config, List[int], float):
         def generate_dynamic_shape(attrs):
-            T = attrs[0]['seg_num']
             self.dynamic_shape.min_input_shape = {
-                "input_data": [1 * T, 3, 32, 32]
+                "input_data": [6, 10, 64, 64]
             }
             self.dynamic_shape.max_input_shape = {
-                "input_data": [3 * T, 3, 64, 64]
+                "input_data": [20, 10, 64, 64]
             }
             self.dynamic_shape.opt_input_shape = {
-                "input_data": [1 * T, 3, 64, 64]
+                "input_data": [6, 10, 64, 64]
             }
 
         def clear_dynamic_shape():
-            self.dynamic_shape.min_input_shape = {}
             self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.min_input_shape = {}
             self.dynamic_shape.opt_input_shape = {}
 
-        def generate_trt_nodes_num(attrs, dynamic_shape):
-            return 1, 2
-
         attrs = [
             program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        # for static_shape
-        clear_dynamic_shape()
-        self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, False
-        ), 1e-5
-        self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, False
-        ), (1e-3, 1e-3)
+        # # for static_shape
+        # clear_dynamic_shape()
+        # self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        # yield self.create_inference_config(), (1, 3), 1e-5
+        # self.trt_param.precision = paddle_infer.PrecisionType.Half
+        # yield self.create_inference_config(), (1, 3), 1e-3
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, True
-        ), 1e-5
+        yield self.create_inference_config(), (1, 2), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(
-            attrs, True
-        ), (1e-3, 1e-3)
+        yield self.create_inference_config(), (1, 2), 1e-3
 
     def test(self):
         self.run_test()

From 053d2dd71a661214e07b6e42570c1a4a47725981 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 5 Mar 2023 21:08:38 +0800
Subject: [PATCH 04/17] update padding code in temporal_shift converter

---
 .../tensorrt/convert/temporal_shift_op.cc     | 192 ++++++++++--------
 1 file changed, 112 insertions(+), 80 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index b2562594965a99..8381d80cc61be9 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -41,7 +41,8 @@ class TemporalShiftOpConverter : public OpConverter {
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
 
-    const float shift_ratio = PADDLE_GET_CONST(float, op_desc.GetAttr("shift_ratio"));
+    const float shift_ratio =
+        PADDLE_GET_CONST(float, op_desc.GetAttr("shift_ratio"));
     const int T = PADDLE_GET_CONST(int, op_desc.GetAttr("seg_num"));
 
     auto input_dims = input->getDimensions();
@@ -52,103 +53,134 @@ class TemporalShiftOpConverter : public OpConverter {
     const int W = input_dims.d[3];
     const int N = NT / T;
 
-    // Reshape input to [N,C,H,W,T]
+    // Reshape input to [N,T,C,H,W]
     auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    nvinfer1::Dims reshape_dims{5, {N, C, H, W, T}};
+    nvinfer1::Dims reshape_dims{5, {N, T, C, H, W}};
     reshape_layer->setReshapeDimensions(reshape_dims);
 
-    // Pad input
-    auto* pad_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                           Padding,
-                                           *reshape_layer->getOutput(0),
-                                           nvinfer1::DimsHW{0, 1},
-                                           nvinfer1::DimsHW{0, 1});
-
-    // Reshape input to [N,T,C,H,W]
-    auto reshape_layer2 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *pad_layer->getOutput(0));
-    nvinfer1::Dims reshape_dims2{5, {N, T + 2, C, H, W}};
-    reshape_layer2->setReshapeDimensions(reshape_dims2);
-
-    // print pad_layer->getOutput(0)->getDimensions()
-    auto pad_dims = pad_layer->getOutput(0)->getDimensions();
-    int dims = pad_dims.nbDims;
-    for (int i = 0; i < dims; ++i) {
-        std::cout << pad_dims.d[i] << " ";
+    // Pad input to [N,T+2,C,H,W]
+    std::vector<int> pre_pad_v{0, 1, 0, 0, 0};
+    std::vector<int> post_pad_v{0, 1, 0, 0, 0};
+    nvinfer1::ITensor* pre_pad = vectorToTensor<int>(pre_pad_v);
+    nvinfer1::ITensor* post_pad = vectorToTensor<int>(post_pad_v);
+
+    std::vector<int> zeros_v(inputDim, 0);
+    auto const zeros = vectorToTensor<int>(zeros_v);
+
+    nvinfer1::ITensor* start{};
+    nvinfer1::ITensor* size{};
+    // elementwise add zeros and pre_pad
+    start = TRT_ENGINE_ADD_LAYER(engine_,
+                                 ElementWise,
+                                 *zeros,
+                                 *pre_pad,
+                                 nvinfer1::ElementWiseOperation::kSUB)
+                ->getOutput(0);
+
+    auto const total_padding =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             ElementWise,
+                             *pre_pad,
+                             *post_pad,
+                             nvinfer1::ElementWiseOperation::kSUM)
+            ->getOutput(0);
+
+    std::vector<int> input_shape_v(inputDim, 0);
+    for (int i = 0; i < inputDim; i++) {
+      input_shape_v[i] = input->getDimensions().d[i];
     }
-    std::cout << std::endl;
-
-    // Slice input
-//    int slice_c = int(C * shift_ratio);
-//    int slice_c2 = int(C * shift_ratio * 2);
-//
-//    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
-//                                              Slice,
-//                                              *pad_layer->getOutput(0),
-//                                              nvinfer1::Dims{5, {0, 0, 0, 0, 0}},
-//                                              nvinfer1::Dims{5, {N, slice_c, H, W, T}},
-//                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
-//    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
-//                                              Slice,
-//                                              *pad_layer->getOutput(0),
-//                                              nvinfer1::Dims{5, {0, slice_c, 0, 0, 2}},
-//                                              nvinfer1::Dims{5, {N, slice_c, H, W, T}},
-//                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
-//    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
-//                                              Slice,
-//                                              *pad_layer->getOutput(0),
-//                                              nvinfer1::Dims{5, {0, slice_c2, 0, 0, 1}},
-//                                              nvinfer1::Dims{5, {N, C - slice_c2, H, W, T}},
-//                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+    auto const input_shape = vectorToTensor<int>(input_shape_v);
+
+    size = TRT_ENGINE_ADD_LAYER(engine_,
+                                ElementWise,
+                                *input_shape,
+                                *total_padding,
+                                nvinfer1::ElementWiseOperation::kSUM)
+               ->getOutput(0);
+    nvinfer1::Dims stride;
+    stride.nbDims = inputDim;
+    std::fill_n(stride.d, inputDim, 1);
+    auto const& dummy = stride;
+    auto* slice_layer =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             Slice,
+                             *const_cast<nvinfer1::ITensor*>(input),
+                             dummy,
+                             dummy,
+                             stride);
+    slice_layer->setInput(1, *start);
+    slice_layer->setInput(2, *size);
+    slice_layer->setMode(nvinfer1::SliceMode::kFILL);
+
+
+    // Slice Padded Tensor
     int slice_c = int(C * shift_ratio);
     int slice_c2 = int(C * shift_ratio * 2);
-    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Slice,
-                                              *reshape_layer2->getOutput(0),
-                                              nvinfer1::Dims{5, {0, 0, 0, 0, 0}},
-                                              nvinfer1::Dims{5, {N, T, slice_c, H, W}},
-                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
-    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Slice,
-                                              *reshape_layer2->getOutput(0),
-                                              nvinfer1::Dims{5, {0, 2, slice_c, 0, 0}},
-                                              nvinfer1::Dims{5, {N, T, slice_c, H, W}},
-                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
-    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Slice,
-                                              *reshape_layer2->getOutput(0),
-                                              nvinfer1::Dims{5, {0, 1, slice_c2, 0, 0}},
-                                              nvinfer1::Dims{5, {N, T, C - slice_c2, H, W}},
-                                              nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+    auto* slice1_layer =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             Slice,
+                             *slice_layer->getOutput(0),
+                             nvinfer1::Dims{5, {0, 0, 0, 0, 0}},
+                             nvinfer1::Dims{5, {N, T, slice_c, H, W}},
+                             nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+    auto* slice2_layer =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             Slice,
+                             *slice_layer->getOutput(0),
+                             nvinfer1::Dims{5, {0, 2, slice_c, 0, 0}},
+                             nvinfer1::Dims{5, {N, T, slice_c, H, W}},
+                             nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+    auto* slice3_layer =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             Slice,
+                             *slice_layer->getOutput(0),
+                             nvinfer1::Dims{5, {0, 1, slice_c2, 0, 0}},
+                             nvinfer1::Dims{5, {N, T, C - slice_c2, H, W}},
+                             nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
 
     // Concatenate slices along the third dimension (C)
     nvinfer1::IConcatenationLayer* concat_layer;
-    if(!slice_c){
-        nvinfer1::ITensor* concat_inputs[2] = {slice2_layer->getOutput(0),
-                                               slice3_layer->getOutput(0)};
-        concat_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Concatenation,
-                                              concat_inputs, 2);
-        concat_layer->setAxis(2);
-    }
-    else{
-        nvinfer1::ITensor* concat_inputs[3] = {slice1_layer->getOutput(0),
-                                               slice2_layer->getOutput(0),
-                                               slice3_layer->getOutput(0)};
-        concat_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                                  Concatenation,
-                                                  concat_inputs, 3);
-        concat_layer->setAxis(2);
+    if (!slice_c) {
+      nvinfer1::ITensor* concat_inputs[2] = {slice2_layer->getOutput(0),
+                                             slice3_layer->getOutput(0)};
+      concat_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Concatenation, concat_inputs, 2);
+      concat_layer->setAxis(2);
+    } else {
+      nvinfer1::ITensor* concat_inputs[3] = {slice1_layer->getOutput(0),
+                                             slice2_layer->getOutput(0),
+                                             slice3_layer->getOutput(0)};
+      concat_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Concatenation, concat_inputs, 3);
+      concat_layer->setAxis(2);
     }
 
     // Reshape output to [N*T,C,H,W]
     nvinfer1::Dims output_shape{4, {N * T, C, H, W}};
-    auto* reshape_layer3 = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
+    auto* reshape_layer3 =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
     reshape_layer3->setReshapeDimensions(output_shape);
 
     // Set output
     auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(reshape_layer3, "temporal_shift", {output_name}, test_mode);
+    RreplenishLayerAndOutput(
+        reshape_layer3, "temporal_shift", {output_name}, test_mode);
+  }
+
+ private:
+  template <typename T>
+  nvinfer1::ITensor* vectorToTensor(std::vector<T> v) {
+    int* v_data = const_cast<T*>(static_cast<const T*>(v.data()));
+
+    nvinfer1::Weights v_wt{nvinfer1::DataType::kINT32,
+                           static_cast<void*>(v_data),
+                           static_cast<int32_t>(v.size())};
+
+    nvinfer1::Dims v_dim;
+    v_dim.nbDims = 1;
+    v_dim.d[0] = static_cast<int>(v.size());
 
+    return TRT_ENGINE_ADD_LAYER(engine_, Constant, v_dim, v_wt)->getOutput(0);
   }
 };
 

From ac677a2fbf6e904e5a4f0f5839b0286044c6dabd Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 5 Mar 2023 22:47:04 +0800
Subject: [PATCH 05/17] update test codes

---
 .../tensorrt/convert/temporal_shift_op.cc          | 14 +++++++-------
 paddle/fluid/inference/tensorrt/op_teller.cc       |  4 ++--
 .../inference/test_trt_convert_temporal_shift.py   |  6 +++---
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index 8381d80cc61be9..9b91de0a45c275 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -64,12 +64,13 @@ class TemporalShiftOpConverter : public OpConverter {
     nvinfer1::ITensor* pre_pad = vectorToTensor<int>(pre_pad_v);
     nvinfer1::ITensor* post_pad = vectorToTensor<int>(post_pad_v);
 
-    std::vector<int> zeros_v(inputDim, 0);
+    int dims = 5;
+    std::vector<int> zeros_v(dims, 0);
     auto const zeros = vectorToTensor<int>(zeros_v);
 
     nvinfer1::ITensor* start{};
     nvinfer1::ITensor* size{};
-    // elementwise add zeros and pre_pad
+
     start = TRT_ENGINE_ADD_LAYER(engine_,
                                  ElementWise,
                                  *zeros,
@@ -85,8 +86,8 @@ class TemporalShiftOpConverter : public OpConverter {
                              nvinfer1::ElementWiseOperation::kSUM)
             ->getOutput(0);
 
-    std::vector<int> input_shape_v(inputDim, 0);
-    for (int i = 0; i < inputDim; i++) {
+    std::vector<int> input_shape_v(dims, 0);
+    for (int i = 0; i < dims; i++) {
       input_shape_v[i] = input->getDimensions().d[i];
     }
     auto const input_shape = vectorToTensor<int>(input_shape_v);
@@ -98,8 +99,8 @@ class TemporalShiftOpConverter : public OpConverter {
                                 nvinfer1::ElementWiseOperation::kSUM)
                ->getOutput(0);
     nvinfer1::Dims stride;
-    stride.nbDims = inputDim;
-    std::fill_n(stride.d, inputDim, 1);
+    stride.nbDims = dims;
+    std::fill_n(stride.d, dims, 1);
     auto const& dummy = stride;
     auto* slice_layer =
         TRT_ENGINE_ADD_LAYER(engine_,
@@ -112,7 +113,6 @@ class TemporalShiftOpConverter : public OpConverter {
     slice_layer->setInput(2, *size);
     slice_layer->setMode(nvinfer1::SliceMode::kFILL);
 
-
     // Slice Padded Tensor
     int slice_c = int(C * shift_ratio);
     int slice_c2 = int(C * shift_ratio * 2);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 99c50b4338b43a..9301d184f80be0 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2739,7 +2739,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "fuse_eleadd_transpose",
       "skip_groupnorm_act",
       "preln_groupnorm_act",
-      "temporal_shift"};
+      "temporal_shift",
       "grid_sampler"};
 
   std::unordered_set<std::string> teller_set{
@@ -2893,7 +2893,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "fuse_eleadd_transpose",
       "skip_groupnorm_act",
       "preln_groupnorm_act",
-      "temporal_shift"};
+      "temporal_shift",
       "grid_sampler"};
 };
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 016c4bbed56ac6..123d993daf2472 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -32,7 +32,7 @@ def generate_input1(attrs):
             T = attrs[0]["seg_num"]
             return np.ones([3 * T, 10, 64, 64]).astype(np.float32)
 
-        for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50]:
+        for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.49]:
             for T in range(2, 5):
                 dics = [{"shift_ratio": shift_value, "seg_num": T}, {}]
 
@@ -93,9 +93,9 @@ def clear_dynamic_shape():
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), (1, 2), 1e-5
+        yield self.create_inference_config(), (0, 3), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (1, 2), 1e-3
+        yield self.create_inference_config(), (0, 3), 1e-3
 
     def test(self):
         self.run_test()

From d7af5e3883c57b2efbb6161c2af82efc12ddbd0a Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 5 Mar 2023 22:52:52 +0800
Subject: [PATCH 06/17] add trt version limit

---
 paddle/fluid/inference/api/analysis_predictor.cc | 2 +-
 paddle/fluid/inference/tensorrt/op_teller.cc     | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 7dc3a48e5e0d6d..213b6a61b74c32 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2469,7 +2469,6 @@ USE_TRT_CONVERTER(mish);
 USE_TRT_CONVERTER(deformable_conv);
 USE_TRT_CONVERTER(pool3d)
 USE_TRT_CONVERTER(square);
-USE_TRT_CONVERTER(temporal_shift);
 // unary op
 USE_TRT_CONVERTER(exp);
 USE_TRT_CONVERTER(log);
@@ -2545,6 +2544,7 @@ USE_TRT_CONVERTER(grid_sampler)
 #endif
 #if IS_TRT_VERSION_GE(8200)
 USE_TRT_CONVERTER(set_value)
+USE_TRT_CONVERTER(temporal_shift);
 #endif
 #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000)
 USE_TRT_CONVERTER(sparse_fc)
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 9301d184f80be0..ab1a8668e9c996 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2579,6 +2579,13 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
     }
 
+    if (op_type == "temporal_shift") {
+#if !IS_TRT_VERSION_GE(8200)
+      VLOG(3) << "temporal_shift is not supported when TensorRT < 8.5.1";
+      return false;
+#endif
+    }
+
     if (use_no_calib_int8) {
       return int8_teller_set.count(op_type);
     } else {

From a2a0416f5b4b4f887a3abf43aad046b1c3911be2 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Mon, 6 Mar 2023 17:37:26 +0800
Subject: [PATCH 07/17] =?UTF-8?q?=E6=9B=B4=E6=96=B0trt=20=E7=89=88?=
 =?UTF-8?q?=E6=9C=AC=E6=8E=A7=E5=88=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tensorrt/convert/temporal_shift_op.cc     | 93 ++++++++++---------
 paddle/fluid/inference/tensorrt/op_teller.cc  |  2 +-
 .../test_trt_convert_temporal_shift.py        | 51 +++++-----
 3 files changed, 76 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index 9b91de0a45c275..e8092302b30c88 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -36,6 +36,7 @@ class TemporalShiftOpConverter : public OpConverter {
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope,
                   bool test_mode) override {
+#if IS_TRT_VERSION_GE(8200)
     VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
@@ -45,6 +46,20 @@ class TemporalShiftOpConverter : public OpConverter {
         PADDLE_GET_CONST(float, op_desc.GetAttr("shift_ratio"));
     const int T = PADDLE_GET_CONST(int, op_desc.GetAttr("seg_num"));
 
+    std::string data_format = "NCHW";
+    if (op_desc.HasAttr("data_format")) {
+      data_format =
+          PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_format"));
+    }
+
+    if (data_format == "NHWC") {
+      // transpose input to [N,C,H,W]
+      auto transpose_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+      nvinfer1::Permutation perm{0, 3, 1, 2};
+      transpose_layer->setFirstTranspose(perm);
+      input = transpose_layer->getOutput(0);
+    }
+
     auto input_dims = input->getDimensions();
 
     const int NT = input_dims.d[0];
@@ -55,18 +70,18 @@ class TemporalShiftOpConverter : public OpConverter {
 
     // Reshape input to [N,T,C,H,W]
     auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    nvinfer1::Dims reshape_dims{5, {N, T, C, H, W}};
+    nvinfer1::Dims reshape_dims{5, { N, T, C, H, W }};
     reshape_layer->setReshapeDimensions(reshape_dims);
 
     // Pad input to [N,T+2,C,H,W]
     std::vector<int> pre_pad_v{0, 1, 0, 0, 0};
     std::vector<int> post_pad_v{0, 1, 0, 0, 0};
-    nvinfer1::ITensor* pre_pad = vectorToTensor<int>(pre_pad_v);
-    nvinfer1::ITensor* post_pad = vectorToTensor<int>(post_pad_v);
+    nvinfer1::ITensor* pre_pad = Add1DConstantLayer(pre_pad_v);
+    nvinfer1::ITensor* post_pad = Add1DConstantLayer(post_pad_v);
 
     int dims = 5;
     std::vector<int> zeros_v(dims, 0);
-    auto const zeros = vectorToTensor<int>(zeros_v);
+    auto const zeros = Add1DConstantLayer(zeros_v);
 
     nvinfer1::ITensor* start{};
     nvinfer1::ITensor* size{};
@@ -90,7 +105,7 @@ class TemporalShiftOpConverter : public OpConverter {
     for (int i = 0; i < dims; i++) {
       input_shape_v[i] = input->getDimensions().d[i];
     }
-    auto const input_shape = vectorToTensor<int>(input_shape_v);
+    auto const input_shape = Add1DConstantLayer(input_shape_v);
 
     size = TRT_ENGINE_ADD_LAYER(engine_,
                                 ElementWise,
@@ -116,27 +131,19 @@ class TemporalShiftOpConverter : public OpConverter {
     // Slice Padded Tensor
     int slice_c = int(C * shift_ratio);
     int slice_c2 = int(C * shift_ratio * 2);
-    auto* slice1_layer =
-        TRT_ENGINE_ADD_LAYER(engine_,
-                             Slice,
-                             *slice_layer->getOutput(0),
-                             nvinfer1::Dims{5, {0, 0, 0, 0, 0}},
-                             nvinfer1::Dims{5, {N, T, slice_c, H, W}},
-                             nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
-    auto* slice2_layer =
-        TRT_ENGINE_ADD_LAYER(engine_,
-                             Slice,
-                             *slice_layer->getOutput(0),
-                             nvinfer1::Dims{5, {0, 2, slice_c, 0, 0}},
-                             nvinfer1::Dims{5, {N, T, slice_c, H, W}},
-                             nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
-    auto* slice3_layer =
-        TRT_ENGINE_ADD_LAYER(engine_,
-                             Slice,
-                             *slice_layer->getOutput(0),
-                             nvinfer1::Dims{5, {0, 1, slice_c2, 0, 0}},
-                             nvinfer1::Dims{5, {N, T, C - slice_c2, H, W}},
-                             nvinfer1::Dims{5, {1, 1, 1, 1, 1}});
+    auto slice_start1 = nvinfer1::Dims{5, { 0, 0, 0, 0, 0 }};
+    auto slice_start2 = nvinfer1::Dims{5, { 0, 2, slice_c, 0, 0 }};
+    auto slice_start3 = nvinfer1::Dims{5, { 0, 1, slice_c2, 0, 0 }};
+    auto slice_size = nvinfer1::Dims{5, { N, T, slice_c, H, W }};
+    auto slice_size2 = nvinfer1::Dims{5, { N, T, C - slice_c2, H, W }};
+    auto slice_stride = nvinfer1::Dims{5, { 1, 1, 1, 1, 1 }};
+
+    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *slice_layer->getOutput(0), slice_start1, slice_size, slice_stride);
+    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *slice_layer->getOutput(0), slice_start2, slice_size, slice_stride);
+    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *slice_layer->getOutput(0), slice_start3, slice_size2, slice_stride);
 
     // Concatenate slices along the third dimension (C)
     nvinfer1::IConcatenationLayer* concat_layer;
@@ -156,31 +163,29 @@ class TemporalShiftOpConverter : public OpConverter {
     }
 
     // Reshape output to [N*T,C,H,W]
-    nvinfer1::Dims output_shape{4, {N * T, C, H, W}};
+    nvinfer1::Dims output_shape{4, { N * T, C, H, W }};
     auto* reshape_layer3 =
         TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
     reshape_layer3->setReshapeDimensions(output_shape);
 
     // Set output
     auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(
-        reshape_layer3, "temporal_shift", {output_name}, test_mode);
-  }
-
- private:
-  template <typename T>
-  nvinfer1::ITensor* vectorToTensor(std::vector<T> v) {
-    int* v_data = const_cast<T*>(static_cast<const T*>(v.data()));
 
-    nvinfer1::Weights v_wt{nvinfer1::DataType::kINT32,
-                           static_cast<void*>(v_data),
-                           static_cast<int32_t>(v.size())};
-
-    nvinfer1::Dims v_dim;
-    v_dim.nbDims = 1;
-    v_dim.d[0] = static_cast<int>(v.size());
-
-    return TRT_ENGINE_ADD_LAYER(engine_, Constant, v_dim, v_wt)->getOutput(0);
+    if (data_format == "NHWC") {
+      // Transpose output to [N*T,C,H,W] -> [N*T,H,W,C]
+      auto transpose_layer2 =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *reshape_layer3->getOutput(0));
+      nvinfer1::Permutation permute_order{0, 2, 3, 1};
+      transpose_layer2->setFirstTranspose(permute_order);
+      RreplenishLayerAndOutput(
+          transpose_layer2, "temporal_shift", {output_name}, test_mode);
+    } else {
+      RreplenishLayerAndOutput(
+          reshape_layer3, "temporal_shift", {output_name}, test_mode);
+    }
+#else
+    VLOG(3) << "Temporal shift is not supported when TensorRT < 8.2";
+#endif
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index ab1a8668e9c996..8efe83c108a272 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2581,7 +2581,7 @@ struct SimpleOpTypeSetTeller : public Teller {
 
     if (op_type == "temporal_shift") {
 #if !IS_TRT_VERSION_GE(8200)
-      VLOG(3) << "temporal_shift is not supported when TensorRT < 8.5.1";
+      VLOG(3) << "temporal_shift is not supported when TensorRT < 8.2";
       return false;
 #endif
     }
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 123d993daf2472..0facb6dcd95369 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -34,31 +34,32 @@ def generate_input1(attrs):
 
         for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.49]:
             for T in range(2, 5):
-                dics = [{"shift_ratio": shift_value, "seg_num": T}, {}]
-
-                ops_config = [
-                    {
-                        "op_type": "temporal_shift",
-                        "op_inputs": {"X": ["input_data"]},
-                        "op_outputs": {"Out": ["output_data"]},
-                        "op_attrs": dics[0],
-                    }
-                ]
-
-                ops = self.generate_op_config(ops_config)
-                for i in range(10):
-                    program_config = ProgramConfig(
-                        ops=ops,
-                        weights={},
-                        inputs={
-                            "input_data": TensorConfig(
-                                data_gen=partial(generate_input1, dics)
-                            ),
-                        },
-                        outputs=["output_data"],
-                    )
-
-                yield program_config
+                for data_format in ["NCHW", "NHWC"]:
+                    dics = [{"shift_ratio": shift_value, "seg_num": T, "data_format": data_format}, {}]
+
+                    ops_config = [
+                        {
+                            "op_type": "temporal_shift",
+                            "op_inputs": {"X": ["input_data"]},
+                            "op_outputs": {"Out": ["output_data"]},
+                            "op_attrs": dics[0],
+                        }
+                    ]
+
+                    ops = self.generate_op_config(ops_config)
+                    for i in range(10):
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={},
+                            inputs={
+                                "input_data": TensorConfig(
+                                    data_gen=partial(generate_input1, dics)
+                                ),
+                            },
+                            outputs=["output_data"],
+                        )
+
+                    yield program_config
 
     def sample_predictor_configs(
             self, program_config

From fcb6504161037a69fc45a13145075a89967e7e15 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Mon, 6 Mar 2023 21:37:58 +0800
Subject: [PATCH 08/17] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=8D=95=E6=B5=8B?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6[=E9=9D=99=E6=80=81shape=E5=8D=95=E6=B5=8B?=
 =?UTF-8?q?=E6=AD=A3=E5=B8=B8=E9=80=9A=E8=BF=87]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddle/fluid/framework/ir/trt_support_nhwc_pass.cc  |  7 ++++---
 .../ir/inference/test_trt_convert_temporal_shift.py | 13 ++++++-------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
index 3e56200dcaa52c..142bc5d601d89e 100644
--- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
+++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
@@ -154,11 +154,12 @@ void TrtSupportNHWCPass::ApplyImpl(Graph *graph) const {
                                                     "bilinear_interp",
                                                     "bilinear_interp_v2",
                                                     "nearest_interp",
-                                                    "nearest_interp_v2"};
+                                                    "nearest_interp_v2",
+                                                    "temporal_shift"};
   // Ops must run under the original layout even though it has
   // data_format/data_layout attribute, otherwise it will be very troublesome!
-  std::unordered_set<std::string> must_original_layout_ops{"affine_channel",
-                                                           "softmax"};
+  std::unordered_set<std::string> must_original_layout_ops{
+      "affine_channel", "softmax", "temporal_shift"};
   // OPs unrelated to layout are consistent according to the layout of input
   // var！
   std::unordered_set<std::string> any_layout_ops{"relu"};
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 0facb6dcd95369..3c9ab410d9acfb 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -36,7 +36,6 @@ def generate_input1(attrs):
             for T in range(2, 5):
                 for data_format in ["NCHW", "NHWC"]:
                     dics = [{"shift_ratio": shift_value, "seg_num": T, "data_format": data_format}, {}]
-
                     ops_config = [
                         {
                             "op_type": "temporal_shift",
@@ -84,12 +83,12 @@ def clear_dynamic_shape():
             program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
 
-        # # for static_shape
-        # clear_dynamic_shape()
-        # self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        # yield self.create_inference_config(), (1, 3), 1e-5
-        # self.trt_param.precision = paddle_infer.PrecisionType.Half
-        # yield self.create_inference_config(), (1, 3), 1e-3
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), (0, 3), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), (0, 3), 1e-3
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)

From 6320c024acf441a73dfda7b01cdb4c53b70da66b Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Tue, 7 Mar 2023 20:52:10 +0800
Subject: [PATCH 09/17] fix code style

---
 .../tensorrt/convert/temporal_shift_op.cc     | 28 +++++++++++++------
 .../test_trt_convert_temporal_shift.py        | 19 +++++++------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index e8092302b30c88..651a2b04df89d9 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -129,8 +129,8 @@ class TemporalShiftOpConverter : public OpConverter {
     slice_layer->setMode(nvinfer1::SliceMode::kFILL);
 
     // Slice Padded Tensor
-    int slice_c = int(C * shift_ratio);
-    int slice_c2 = int(C * shift_ratio * 2);
+    int slice_c = static_cast<int>(C * shift_ratio);
+    int slice_c2 = static_cast<int>(C * shift_ratio * 2);
     auto slice_start1 = nvinfer1::Dims{5, { 0, 0, 0, 0, 0 }};
     auto slice_start2 = nvinfer1::Dims{5, { 0, 2, slice_c, 0, 0 }};
     auto slice_start3 = nvinfer1::Dims{5, { 0, 1, slice_c2, 0, 0 }};
@@ -138,12 +138,24 @@ class TemporalShiftOpConverter : public OpConverter {
     auto slice_size2 = nvinfer1::Dims{5, { N, T, C - slice_c2, H, W }};
     auto slice_stride = nvinfer1::Dims{5, { 1, 1, 1, 1, 1 }};
 
-    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Slice, *slice_layer->getOutput(0), slice_start1, slice_size, slice_stride);
-    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Slice, *slice_layer->getOutput(0), slice_start2, slice_size, slice_stride);
-    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Slice, *slice_layer->getOutput(0), slice_start3, slice_size2, slice_stride);
+    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Slice,
+                                              *slice_layer->getOutput(0),
+                                              slice_start1,
+                                              slice_size,
+                                              slice_stride);
+    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Slice,
+                                              *slice_layer->getOutput(0),
+                                              slice_start2,
+                                              slice_size,
+                                              slice_stride);
+    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                              Slice,
+                                              *slice_layer->getOutput(0),
+                                              slice_start3,
+                                              slice_size2,
+                                              slice_stride);
 
     // Concatenate slices along the third dimension (C)
     nvinfer1::IConcatenationLayer* concat_layer;
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 3c9ab410d9acfb..137d75a306ecd4 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -35,7 +35,14 @@ def generate_input1(attrs):
         for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.49]:
             for T in range(2, 5):
                 for data_format in ["NCHW", "NHWC"]:
-                    dics = [{"shift_ratio": shift_value, "seg_num": T, "data_format": data_format}, {}]
+                    dics = [
+                        {
+                            "shift_ratio": shift_value,
+                            "seg_num": T,
+                            "data_format": data_format,
+                        },
+                        {},
+                    ]
                     ops_config = [
                         {
                             "op_type": "temporal_shift",
@@ -61,18 +68,14 @@ def generate_input1(attrs):
                     yield program_config
 
     def sample_predictor_configs(
-            self, program_config
+        self, program_config
     ) -> (paddle_infer.Config, List[int], float):
         def generate_dynamic_shape(attrs):
-            self.dynamic_shape.min_input_shape = {
-                "input_data": [6, 10, 64, 64]
-            }
+            self.dynamic_shape.min_input_shape = {"input_data": [6, 10, 64, 64]}
             self.dynamic_shape.max_input_shape = {
                 "input_data": [20, 10, 64, 64]
             }
-            self.dynamic_shape.opt_input_shape = {
-                "input_data": [6, 10, 64, 64]
-            }
+            self.dynamic_shape.opt_input_shape = {"input_data": [6, 10, 64, 64]}
 
         def clear_dynamic_shape():
             self.dynamic_shape.max_input_shape = {}

From b6656c7e184f637be6bb3f541b239fea3b104860 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Thu, 9 Mar 2023 16:10:21 +0800
Subject: [PATCH 10/17] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=8D=95=E6=B5=8B?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../inference/test_trt_convert_temporal_shift.py  | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 137d75a306ecd4..e3e2a2c3b3876e 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -30,7 +30,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool:
     def sample_program_configs(self):
         def generate_input1(attrs):
             T = attrs[0]["seg_num"]
-            return np.ones([3 * T, 10, 64, 64]).astype(np.float32)
+            return np.random.rand(3 * T, 10, 64, 64).astype(np.float32)
 
         for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.49]:
             for T in range(2, 5):
@@ -68,7 +68,7 @@ def generate_input1(attrs):
                     yield program_config
 
     def sample_predictor_configs(
-        self, program_config
+            self, program_config
     ) -> (paddle_infer.Config, List[int], float):
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [6, 10, 64, 64]}
@@ -82,6 +82,9 @@ def clear_dynamic_shape():
             self.dynamic_shape.min_input_shape = {}
             self.dynamic_shape.opt_input_shape = {}
 
+        def generate_trt_nodes_num(attrs, is_dynamic_shape):
+            return 0, 3
+
         attrs = [
             program_config.ops[i].attrs for i in range(len(program_config.ops))
         ]
@@ -89,16 +92,16 @@ def clear_dynamic_shape():
         # for static_shape
         clear_dynamic_shape()
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), (0, 3), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs, False), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (0, 3), 1e-3
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs, False), 1e-3
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), (0, 3), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), (0, 3), 1e-3
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-3
 
     def test(self):
         self.run_test()

From 15087cbbd7c510f2027b0708f020b473fb1151ab Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Thu, 9 Mar 2023 16:13:00 +0800
Subject: [PATCH 11/17] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=8D=95=E6=B5=8B?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../test_trt_convert_temporal_shift.py         | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index e3e2a2c3b3876e..49a717535e57eb 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -68,7 +68,7 @@ def generate_input1(attrs):
                     yield program_config
 
     def sample_predictor_configs(
-            self, program_config
+        self, program_config
     ) -> (paddle_infer.Config, List[int], float):
         def generate_dynamic_shape(attrs):
             self.dynamic_shape.min_input_shape = {"input_data": [6, 10, 64, 64]}
@@ -92,16 +92,24 @@ def generate_trt_nodes_num(attrs, is_dynamic_shape):
         # for static_shape
         clear_dynamic_shape()
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs, False), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs, False), 1e-3
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False
+        ), 1e-3
 
         # for dynamic_shape
         generate_dynamic_shape(attrs)
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True
+        ), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
-        yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-3
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True
+        ), 1e-3
 
     def test(self):
         self.run_test()

From f163694e12e4efd97ef55b7987068862057408e0 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 12 Mar 2023 11:11:55 +0800
Subject: [PATCH 12/17] update temporal_shift_op settings

---
 .../framework/ir/trt_support_nhwc_pass.cc     |  3 +-
 .../fluid/inference/api/analysis_predictor.cc |  2 +-
 paddle/fluid/inference/tensorrt/op_teller.cc  | 31 ++++++++++++++++++-
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
index 142bc5d601d89e..86c7b7c9dbbaee 100644
--- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
+++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
@@ -154,8 +154,7 @@ void TrtSupportNHWCPass::ApplyImpl(Graph *graph) const {
                                                     "bilinear_interp",
                                                     "bilinear_interp_v2",
                                                     "nearest_interp",
-                                                    "nearest_interp_v2",
-                                                    "temporal_shift"};
+                                                    "nearest_interp_v2"};
   // Ops must run under the original layout even though it has
   // data_format/data_layout attribute, otherwise it will be very troublesome!
   std::unordered_set<std::string> must_original_layout_ops{
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 677f331b462ede..ccda587530bfdc 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2546,7 +2546,7 @@ USE_TRT_CONVERTER(grid_sampler)
 #endif
 #if IS_TRT_VERSION_GE(8200)
 USE_TRT_CONVERTER(set_value)
-USE_TRT_CONVERTER(temporal_shift);
+USE_TRT_CONVERTER(temporal_shift)
 #endif
 #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000)
 USE_TRT_CONVERTER(sparse_fc)
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 68cdda88b81e75..887b4de9104918 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2589,6 +2589,35 @@ struct SimpleOpTypeSetTeller : public Teller {
       VLOG(3) << "temporal_shift is not supported when TensorRT < 8.2";
       return false;
 #endif
+
+      if (!with_dynamic_shape) {
+        VLOG(3) << "the temporal shift does not support "
+                   "static shape yet";
+        return false;
+      }
+
+      if (!desc.HasAttr("shift_ratio") || !desc.HasAttr("seg_num")) {
+        VLOG(3) << "temporal shift need attributes : shift_ratio and seg_num";
+        return false;
+      }
+
+      auto* block = desc.Block();
+      if (block == nullptr) {
+        VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
+                   "Developers need to check whether block_desc is passed in "
+                   "the pass.";
+        return false;
+      }
+
+      auto input_name = desc.Input("X")[0];
+      auto* input_desc = block->FindVar(input_name);
+      const auto input_shape = input_desc->GetShape();
+
+      if (input_shape.size() != 4) {
+        VLOG(3) << "The input tensor of temporal_shift must be of rank 4 "
+                   "when using the TRT converter.";
+        return false;
+      }
     }
 
     if (use_no_calib_int8) {
@@ -2907,7 +2936,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "fuse_eleadd_transpose",
       "skip_groupnorm_act",
       "preln_groupnorm_act",
-      "temporal_shift"
+      "temporal_shift",
       "grid_sampler"};
 };
 

From e9dee6a3fe8b6fde71b2aada0a53e8f1112089be Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 12 Mar 2023 11:12:31 +0800
Subject: [PATCH 13/17] fix bugs about temporal_shift

---
 .../tensorrt/convert/temporal_shift_op.cc     | 112 +++++++++++-------
 1 file changed, 70 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index 651a2b04df89d9..eb5ed05c57a33b 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -37,7 +37,8 @@ class TemporalShiftOpConverter : public OpConverter {
                   const framework::Scope& scope,
                   bool test_mode) override {
 #if IS_TRT_VERSION_GE(8200)
-    VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer";
+
+    VLOG(3) << "convert a fluid temporal shift op to tensorrt temporal layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
@@ -62,16 +63,17 @@ class TemporalShiftOpConverter : public OpConverter {
 
     auto input_dims = input->getDimensions();
 
-    const int NT = input_dims.d[0];
     const int C = input_dims.d[1];
     const int H = input_dims.d[2];
     const int W = input_dims.d[3];
-    const int N = NT / T;
+    std::cout << "C: " << C << " H: " << H << " W: " << W
+              << "shift_ratio: " << shift_ratio << " T: " << T << std::endl;
 
     // Reshape input to [N,T,C,H,W]
     auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    nvinfer1::Dims reshape_dims{5, { N, T, C, H, W }};
+    nvinfer1::Dims reshape_dims{5, { -1, T, C, H, W }};
     reshape_layer->setReshapeDimensions(reshape_dims);
+    input = reshape_layer->getOutput(0);
 
     // Pad input to [N,T+2,C,H,W]
     std::vector<int> pre_pad_v{0, 1, 0, 0, 0};
@@ -101,11 +103,7 @@ class TemporalShiftOpConverter : public OpConverter {
                              nvinfer1::ElementWiseOperation::kSUM)
             ->getOutput(0);
 
-    std::vector<int> input_shape_v(dims, 0);
-    for (int i = 0; i < dims; i++) {
-      input_shape_v[i] = input->getDimensions().d[i];
-    }
-    auto const input_shape = Add1DConstantLayer(input_shape_v);
+    auto const input_shape = Shape(input);
 
     size = TRT_ENGINE_ADD_LAYER(engine_,
                                 ElementWise,
@@ -126,36 +124,67 @@ class TemporalShiftOpConverter : public OpConverter {
                              stride);
     slice_layer->setInput(1, *start);
     slice_layer->setInput(2, *size);
+#if IS_TRT_VERSION_GE(8500)
+    slice_layer->setMode(nvinfer1::SampleMode::kFILL);
+#else
     slice_layer->setMode(nvinfer1::SliceMode::kFILL);
-
+#endif
+    
     // Slice Padded Tensor
-    int slice_c = static_cast<int>(C * shift_ratio);
-    int slice_c2 = static_cast<int>(C * shift_ratio * 2);
-    auto slice_start1 = nvinfer1::Dims{5, { 0, 0, 0, 0, 0 }};
-    auto slice_start2 = nvinfer1::Dims{5, { 0, 2, slice_c, 0, 0 }};
-    auto slice_start3 = nvinfer1::Dims{5, { 0, 1, slice_c2, 0, 0 }};
-    auto slice_size = nvinfer1::Dims{5, { N, T, slice_c, H, W }};
-    auto slice_size2 = nvinfer1::Dims{5, { N, T, C - slice_c2, H, W }};
-    auto slice_stride = nvinfer1::Dims{5, { 1, 1, 1, 1, 1 }};
-
-    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Slice,
-                                              *slice_layer->getOutput(0),
-                                              slice_start1,
-                                              slice_size,
-                                              slice_stride);
-    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Slice,
-                                              *slice_layer->getOutput(0),
-                                              slice_start2,
-                                              slice_size,
-                                              slice_stride);
-    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                              Slice,
-                                              *slice_layer->getOutput(0),
-                                              slice_start3,
-                                              slice_size2,
-                                              slice_stride);
+    const int slice_c = static_cast<int>(C * shift_ratio);
+    const int slice_c2 = static_cast<int>(C * shift_ratio * 2);
+
+    nvinfer1::ITensor* slice_start1 = Add1DConstantLayer(zeros_v);
+    nvinfer1::ITensor* slice_start2 =
+        Add1DConstantLayer(std::vector<int>{0, 2, slice_c, 0, 0});
+    nvinfer1::ITensor* slice_start3 =
+        Add1DConstantLayer(std::vector<int>{0, 1, slice_c2, 0, 0});
+    
+    nvinfer1::ITensor* slice_size_base = Shape(input);
+    nvinfer1::ITensor* sub_size1 =
+        Add1DConstantLayer(std::vector<int>{0, 0, C - slice_c, 0, 0});
+    nvinfer1::ITensor* sub_size2 = Add1DConstantLayer(
+        std::vector<int>{0, 0, C + slice_c - slice_c2, 0, 0});
+    nvinfer1::ITensor* sub_size3 =
+        Add1DConstantLayer(std::vector<int>{0, 0, slice_c2, 0, 0});
+    // [N, T, C, H, W] - [0, 0, C - slice_c, 0, 0] = [N, T, slice_c, H, W]
+    nvinfer1::ITensor* slice_size1 =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             ElementWise,
+                             *slice_size_base,
+                             *sub_size1,
+                             nvinfer1::ElementWiseOperation::kSUB)
+            ->getOutput(0);
+
+    nvinfer1::ITensor* slice_size2 =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             ElementWise,
+                             *slice_size_base,
+                             *sub_size2,
+                             nvinfer1::ElementWiseOperation::kSUB)
+            ->getOutput(0);
+    nvinfer1::ITensor* slice_size3 =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             ElementWise,
+                             *slice_size_base,
+                             *sub_size3,
+                             nvinfer1::ElementWiseOperation::kSUB)
+            ->getOutput(0);
+    
+    auto* slice1_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *slice_layer->getOutput(0), dummy, dummy, stride);
+    slice1_layer->setInput(1, *slice_start1);
+    slice1_layer->setInput(2, *slice_size1);
+    
+    auto* slice2_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *slice_layer->getOutput(0), dummy, dummy, stride);
+    slice2_layer->setInput(1, *slice_start2);
+    slice2_layer->setInput(2, *slice_size2);
+    
+    auto* slice3_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *slice_layer->getOutput(0), dummy, dummy, stride);
+    slice3_layer->setInput(1, *slice_start3);
+    slice3_layer->setInput(2, *slice_size3);
 
     // Concatenate slices along the third dimension (C)
     nvinfer1::IConcatenationLayer* concat_layer;
@@ -173,16 +202,15 @@ class TemporalShiftOpConverter : public OpConverter {
           TRT_ENGINE_ADD_LAYER(engine_, Concatenation, concat_inputs, 3);
       concat_layer->setAxis(2);
     }
-
+    
     // Reshape output to [N*T,C,H,W]
-    nvinfer1::Dims output_shape{4, { N * T, C, H, W }};
     auto* reshape_layer3 =
         TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
-    reshape_layer3->setReshapeDimensions(output_shape);
-
+    reshape_layer3->setReshapeDimensions(input_dims);
+    
     // Set output
     auto output_name = op_desc.Output("Out")[0];
-
+    
     if (data_format == "NHWC") {
       // Transpose output to [N*T,C,H,W] -> [N*T,H,W,C]
       auto transpose_layer2 =

From f247f64a18d82fa68c681a1ecfe0b8d9c553f328 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 12 Mar 2023 11:14:12 +0800
Subject: [PATCH 14/17] update trt node nums for dynamic mode

---
 .../test_trt_convert_temporal_shift.py        | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index 49a717535e57eb..e59135b424296b 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -25,12 +25,31 @@
 
 class TrtConvertTemporalShiftTest(TrtLayerAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        compile_version = paddle_infer.get_trt_compile_version()
+        runtime_version = paddle_infer.get_trt_runtime_version()
+        if (
+            compile_version[0] * 1000
+            + compile_version[1] * 100
+            + compile_version[2] * 10
+            < 8200
+        ):
+            return False
+        if (
+            runtime_version[0] * 1000
+            + runtime_version[1] * 100
+            + runtime_version[2] * 10
+            < 8200
+        ):
+            return False
         return True
 
     def sample_program_configs(self):
         def generate_input1(attrs):
             T = attrs[0]["seg_num"]
-            return np.random.rand(3 * T, 10, 64, 64).astype(np.float32)
+            shape = [2 * T, 10, 64, 64]
+            return np.random.uniform(low=0.1, high=1.0, size=shape).astype(
+                np.float32
+            )
 
         for shift_value in [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.49]:
             for T in range(2, 5):
@@ -71,11 +90,16 @@ def sample_predictor_configs(
         self, program_config
     ) -> (paddle_infer.Config, List[int], float):
         def generate_dynamic_shape(attrs):
-            self.dynamic_shape.min_input_shape = {"input_data": [6, 10, 64, 64]}
+            t = attrs[0]['seg_num']
+            self.dynamic_shape.min_input_shape = {
+                "input_data": [2 * t, 10, 64, 64]
+            }
             self.dynamic_shape.max_input_shape = {
-                "input_data": [20, 10, 64, 64]
+                "input_data": [5 * t, 10, 64, 64]
+            }
+            self.dynamic_shape.opt_input_shape = {
+                "input_data": [3 * t, 10, 64, 64]
             }
-            self.dynamic_shape.opt_input_shape = {"input_data": [6, 10, 64, 64]}
 
         def clear_dynamic_shape():
             self.dynamic_shape.max_input_shape = {}
@@ -83,6 +107,14 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         def generate_trt_nodes_num(attrs, is_dynamic_shape):
+            valid_version = (8, 2, 0)
+            compile_version = paddle_infer.get_trt_compile_version()
+            runtime_version = paddle_infer.get_trt_runtime_version()
+            self.assertTrue(compile_version == runtime_version)
+            if compile_version < valid_version:
+                return 0, 3
+            if is_dynamic_shape:
+                return 1, 2
             return 0, 3
 
         attrs = [

From 90973fc30cfee678d2946307caaaf78c80606379 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Sun, 12 Mar 2023 11:15:11 +0800
Subject: [PATCH 15/17] remove std::cout

---
 .../tensorrt/convert/temporal_shift_op.cc      | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index eb5ed05c57a33b..157d4a8580c8ae 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -66,8 +66,6 @@ class TemporalShiftOpConverter : public OpConverter {
     const int C = input_dims.d[1];
     const int H = input_dims.d[2];
     const int W = input_dims.d[3];
-    std::cout << "C: " << C << " H: " << H << " W: " << W
-              << "shift_ratio: " << shift_ratio << " T: " << T << std::endl;
 
     // Reshape input to [N,T,C,H,W]
     auto reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
@@ -129,7 +127,7 @@ class TemporalShiftOpConverter : public OpConverter {
 #else
     slice_layer->setMode(nvinfer1::SliceMode::kFILL);
 #endif
-    
+
     // Slice Padded Tensor
     const int slice_c = static_cast<int>(C * shift_ratio);
     const int slice_c2 = static_cast<int>(C * shift_ratio * 2);
@@ -139,7 +137,7 @@ class TemporalShiftOpConverter : public OpConverter {
         Add1DConstantLayer(std::vector<int>{0, 2, slice_c, 0, 0});
     nvinfer1::ITensor* slice_start3 =
         Add1DConstantLayer(std::vector<int>{0, 1, slice_c2, 0, 0});
-    
+
     nvinfer1::ITensor* slice_size_base = Shape(input);
     nvinfer1::ITensor* sub_size1 =
         Add1DConstantLayer(std::vector<int>{0, 0, C - slice_c, 0, 0});
@@ -170,17 +168,17 @@ class TemporalShiftOpConverter : public OpConverter {
                              *sub_size3,
                              nvinfer1::ElementWiseOperation::kSUB)
             ->getOutput(0);
-    
+
     auto* slice1_layer = TRT_ENGINE_ADD_LAYER(
         engine_, Slice, *slice_layer->getOutput(0), dummy, dummy, stride);
     slice1_layer->setInput(1, *slice_start1);
     slice1_layer->setInput(2, *slice_size1);
-    
+
     auto* slice2_layer = TRT_ENGINE_ADD_LAYER(
         engine_, Slice, *slice_layer->getOutput(0), dummy, dummy, stride);
     slice2_layer->setInput(1, *slice_start2);
     slice2_layer->setInput(2, *slice_size2);
-    
+
     auto* slice3_layer = TRT_ENGINE_ADD_LAYER(
         engine_, Slice, *slice_layer->getOutput(0), dummy, dummy, stride);
     slice3_layer->setInput(1, *slice_start3);
@@ -202,15 +200,15 @@ class TemporalShiftOpConverter : public OpConverter {
           TRT_ENGINE_ADD_LAYER(engine_, Concatenation, concat_inputs, 3);
       concat_layer->setAxis(2);
     }
-    
+
     // Reshape output to [N*T,C,H,W]
     auto* reshape_layer3 =
         TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *concat_layer->getOutput(0));
     reshape_layer3->setReshapeDimensions(input_dims);
-    
+
     // Set output
     auto output_name = op_desc.Output("Out")[0];
-    
+
     if (data_format == "NHWC") {
       // Transpose output to [N*T,C,H,W] -> [N*T,H,W,C]
       auto transpose_layer2 =

From 7b9ae3fa716c64fbd39a5d6d207ed091df486463 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Mon, 13 Mar 2023 08:37:05 +0800
Subject: [PATCH 16/17] delete compile version judge in test file

---
 .../inference/test_trt_convert_temporal_shift.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
index e59135b424296b..b0b2ce5106213c 100755
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_temporal_shift.py
@@ -25,22 +25,6 @@
 
 class TrtConvertTemporalShiftTest(TrtLayerAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
-        compile_version = paddle_infer.get_trt_compile_version()
-        runtime_version = paddle_infer.get_trt_runtime_version()
-        if (
-            compile_version[0] * 1000
-            + compile_version[1] * 100
-            + compile_version[2] * 10
-            < 8200
-        ):
-            return False
-        if (
-            runtime_version[0] * 1000
-            + runtime_version[1] * 100
-            + runtime_version[2] * 10
-            < 8200
-        ):
-            return False
         return True
 
     def sample_program_configs(self):

From 27d43f830d7484340699a067849f08e30cb93bf3 Mon Sep 17 00:00:00 2001
From: andsonder <changlu@keter.top>
Date: Mon, 13 Mar 2023 11:14:54 +0800
Subject: [PATCH 17/17] remove useless codes

---
 .../tensorrt/convert/temporal_shift_op.cc          | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
index 157d4a8580c8ae..03983ff3930336 100644
--- a/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/temporal_shift_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,16 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
-namespace paddle {
-namespace framework {
-class Scope;
-
-namespace proto {
-class OpDesc;
-}  // namespace proto
-}  // namespace framework
-}  // namespace paddle
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {
@@ -38,7 +28,7 @@ class TemporalShiftOpConverter : public OpConverter {
                   bool test_mode) override {
 #if IS_TRT_VERSION_GE(8200)
 
-    VLOG(3) << "convert a fluid temporal shift op to tensorrt temporal layer";
+    VLOG(3) << "convert a temporal shift op to tensorrt temporal layer";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);